diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2019-01-23 11:24:50 +0000 |
---|---|---|
committer | Michalis Spyrou <michalis.spyrou@arm.com> | 2019-01-24 10:19:46 +0000 |
commit | 1d480652b820317fc97ccbc3cb517e3b9e8be197 (patch) | |
tree | b3c845ec02cccf89430b95186ed3e3f2ae65e2bd /src/core | |
parent | 20b527a7029d02d0edda78fd92002cbc430dbe05 (diff) | |
download | ComputeLibrary-1d480652b820317fc97ccbc3cb517e3b9e8be197.tar.gz |
COMPMID-1867: Add u8 and s8 hybrid assembly kernels.
Change-Id: Ifeb005f9d18d19feff11949474cce84d9e03749c
Reviewed-on: https://review.mlplatform.org/565
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core')
39 files changed, 8310 insertions, 322 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp index 09f03c6332..c2bd0bb882 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -28,6 +28,7 @@ #include <algorithm> #include "arm_gemm.hpp" +#include "ndrange.hpp" #include "utils.hpp" #include "mergeresults.hpp" @@ -60,69 +61,66 @@ class GemmHybrid : public GemmCommon<To, Tr> { const Tr _beta; /* Blocking info */ - unsigned int _k_block=0; - unsigned int _x_block=0; - unsigned int _Mround=0; + const unsigned int _k_block; + const unsigned int _n_block; + const unsigned int _Mround; /* Pretransposed buffer. */ const Toi *_B_transposed=nullptr; - unsigned int _B_per_multi = 0; + const NDRange<4> _window_range; - /* We will need to walk through the blocks of B in a few contexts, so - * factor that out. */ - class blockwalker { - private: - /* Size loops, etc. based on our parent's configuration */ - const GemmHybrid<strategy, To, Tr> &_parent; + static unsigned int compute_k_block(const GemmArgs<Tr> &args) { + if (args._cfg && args._cfg->inner_block_size) { + return args._cfg->inner_block_size; + } - /* K, X and multi parameters for current iteration. */ - unsigned int _k0=0, _x0=0; + const unsigned int L1_size = args._ci->get_L1_cache_size(); - unsigned int _index=0; - bool _done=false; - bool _newkblock=true; + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); - public: - blockwalker(const GemmHybrid<strategy, To, Tr> &parent) : _parent(parent) { } + // Needs to be (at least a single) multiple of the K unroll level. 
+ k_block /= strategy::k_unroll(); + k_block = std::max(k_block, 1U) * strategy::k_unroll(); - unsigned int xmax() { - return std::min(_x0 + _parent._x_block, _parent._Nsize); - } + // Now tune to presented problem size; this is how many blocks we need. + unsigned int numk_blocks = iceildiv(args._Ksize, k_block); - unsigned int kmax() { - return std::min(_k0 + _parent._k_block, _parent._Ksize); - } + // So divide the space equally into that many blocks. + k_block = iceildiv(args._Ksize, numk_blocks); - /* Advance to the next block, return false at the end. */ - bool advance(void) { - if (_done) { - return false; - } + // And round UP to the K unroll level required. + k_block = roundup(k_block, strategy::k_unroll()); - _newkblock=false; - _x0 += _parent._x_block; - if (_x0 >= _parent._Nsize) { - _x0=0; - _k0 += _parent._k_block; - if (_k0 >= _parent._Ksize) { - _done=true; - return false; - } - _newkblock=true; - } - _index++; + return k_block; + } - return true; + static unsigned int compute_n_block(const GemmArgs<Tr> &args) { + if (args._cfg && args._cfg->outer_block_size) { + return args._cfg->outer_block_size; } - unsigned int k0(void) { return _k0; } - unsigned int x0(void) { return _x0; } - unsigned int index(void) { return _index; } - bool done(void) { return _done; } - bool newkblock(void) { return _newkblock; } - }; + const unsigned int k_block = compute_k_block(args); + const unsigned int L2_size = args._ci->get_L2_cache_size(); + // n_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * k_block); + + // Needs to be (at least a single) multiple of the kernel output width. 
+ n_block /= strategy::out_width(); + n_block = std::max(n_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. + unsigned int numblocks = iceildiv(args._Nsize, n_block); + n_block = iceildiv(args._Nsize, numblocks); + n_block = roundup(n_block, strategy::out_width()); + + return n_block; + } public: GemmHybrid(GemmHybrid &) = delete; @@ -130,71 +128,20 @@ public: /* Constructor */ GemmHybrid(const GemmArgs<Tr> &args) - : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches), - _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta) { - const unsigned int L1_size = _ci->get_L1_cache_size(); - const unsigned int L2_size = _ci->get_L2_cache_size(); - - _B_per_multi = (iceildiv(_Nsize, strategy::out_width()) * strategy::out_width()) * - (iceildiv(_Ksize, strategy::k_unroll()) * strategy::k_unroll()); - - // Work out blocking parameters, or override from config. - - if (args._cfg && args._cfg->inner_block_size) { - _k_block = args._cfg->inner_block_size; - } else { - // k_block: Find out how much of the larger array can be loaded into half the cache. - // This should account for associative caches. - _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); - - // Needs to be (at least a single) multiple of the K unroll level. - _k_block /= strategy::k_unroll(); - _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); - - // Now tune to presented problem size; this is how many blocks we need. - int num_k_blocks = iceildiv(_Ksize, _k_block); - - // So divide the space equally into that many blocks. - _k_block = iceildiv(_Ksize, num_k_blocks); - - // And round UP to the K unroll level required. 
- _k_block = iceildiv(_k_block, strategy::k_unroll()); - _k_block *= strategy::k_unroll(); - } - - if (args._cfg && args._cfg->outer_block_size) { - _x_block = args._cfg->outer_block_size; - } else { - // x_block: Work out how many rows (of length k_block) will fit in the L2 - // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / - (sizeof(Toi) * _k_block); - - // Needs to be (at least a single) multiple of the kernel output width. - _x_block /= strategy::out_width(); - _x_block = std::max(_x_block, 1U) * strategy::out_width(); - - // And tune to the presented problem size. - int num_x_blocks = iceildiv(_Nsize, _x_block); - _x_block = iceildiv(_Nsize, num_x_blocks); - - _x_block = iceildiv(_x_block, strategy::out_width()); - _x_block *= strategy::out_width(); - } - - // Work out the rounded size of M - needed for some buffers. - _Mround = iceildiv(_Msize, strategy::out_height()); - _Mround *= strategy::out_height(); - } + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _nbatches(args._nbatches), _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args)), + _Mround(roundup(args._Msize, strategy::out_height())), + _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { } // Interface implementation - Compulsory functions - - // Window size: Only the last thread should do a ragged block, so dole - // out work in units of out_height. Factor batches and multi into the - // window too. unsigned int get_window_size() const override { - // _Mround is a multiple of out_height by definition. - return (_Mround / strategy::out_height()) * _nbatches * _nmulti; + return _window_range.total_size(); + } + + // This kernel can always be dynamically scheduled. 
+ bool supports_dynamic_scheduling() const override { + return true; } // Execute @@ -206,50 +153,45 @@ public: /* Make sure we've been set up correctly. */ assert(_B_transposed); - - const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height()); - const unsigned int window_per_multi = window_per_batch * _nbatches; - - const unsigned int first_multi = start / window_per_multi; - const unsigned int last_multi = end / window_per_multi; - - const unsigned int first_batch = (start - (first_multi * window_per_multi)) / window_per_batch; - const unsigned int last_batch = (end - (last_multi * window_per_multi)) / window_per_batch; - - const unsigned int first_row = ((start - (first_multi * window_per_multi)) % window_per_batch) * strategy::out_height(); - const unsigned int last_row = ((end - (last_multi * window_per_multi)) % window_per_batch) * strategy::out_height(); - static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same."); static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same."); - for (unsigned int multi = first_multi; multi <= last_multi; multi++) { - const unsigned int batch_0 = (multi == first_multi) ? first_batch : 0; - const unsigned int batch_max = (multi == last_multi) ? last_batch : (_nbatches - 1); + /* For now, each work item implies all the K for a given output + * pixel (so we don't need to synchronize access to the output + * array). So separate the loop over K blocks here. 
*/ + for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) { + unsigned int kmax = std::min(k0 + _k_block, _Ksize); + unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll()); - const Toi *b_panel = _B_transposed + (multi * _B_per_multi); + auto p = _window_range.iterator(start, end); - for (blockwalker current(*this); !current.done(); current.advance()) { - int kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll()); - kern_k *= strat.k_unroll(); + if (p.done()) { + return; + } - int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + do { + const unsigned int m_start = p.dim(0) * strategy::out_height(); + const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize); + const unsigned int batch = p.dim(1); + const unsigned int n0 = p.dim(2) * _n_block; + const unsigned int nmax = std::min(n0 + _n_block, _Nsize); + const unsigned int multi = p.dim(3); + + const Toi *b_panel = _B_transposed + + (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) + + (k0 * roundup(_Nsize, strategy::out_width())) + + (n0 * kern_k); - for (unsigned int batch = batch_0; batch <= batch_max; batch++) { - const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0; - const unsigned int m_end = ((multi == last_multi) && (batch == last_batch) ) ? last_row : _Msize; #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * bblocks * strategy::out_width()); + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); #endif - strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + current.k0(), this->_lda, - b_panel, - this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + current.x0(), this->_ldc, - (current.k0() == 0) ? 
_beta : static_cast<Tr>(1), - (m_end - m_start), (current.xmax() - current.x0()), kern_k); - } - - b_panel += (bblocks * strat.out_width() * kern_k); - } + strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda, + b_panel, + this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc, + (k0 == 0) ? _beta : static_cast<Tr>(1), + (m_end - m_start), (nmax - n0), kern_k); + } while (p.next_dim1()); } } @@ -263,35 +205,31 @@ public: } size_t get_B_pretransposed_array_size() const override { - return _B_per_multi * _nmulti * sizeof(Toi); + return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi); } + using GemmCommon<To, Tr>::pretranspose_B_array; void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { Toi *buffer = reinterpret_cast<Toi *>(in_buffer); _B_transposed = buffer; strategy strat(_ci); - for (unsigned int multi=0; multi < _nmulti; multi++) { - blockwalker current(*this); - - do { - /* Figure out the size of each block. */ - size_t x_size = (current.xmax() - current.x0()); - size_t k_size = (current.kmax() - current.k0()); + for (unsigned int multi=0; multi<_nmulti; multi++) { + for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) { + const unsigned int kmax = std::min(k0 + _k_block, _Ksize); + const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll()); - /* Round sizes up as needed. 
*/ - x_size = iceildiv(x_size, strategy::out_width()); - x_size *= strategy::out_width(); + for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) { + const unsigned int xmax = std::min(x0+_n_block, _Nsize); - k_size = iceildiv(k_size, strategy::k_unroll()); - k_size *= strategy::k_unroll(); + const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size; - strat.transforms.PrepareB( - buffer, B + (multi * B_multi_stride), ldb, - current.x0(), current.xmax(), current.k0(), current.kmax(), _trB); + strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb, + x0, xmax, k0, kmax, _trB); - buffer += (x_size * k_size); - } while (current.advance()); + buffer += size; + } + } } } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index 34dc8bc341..5811c2a1ce 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #include "arm_gemm.hpp" #include "gemm_common.hpp" +#include "gemm_hybrid.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" #include "gemm_native.hpp" @@ -32,6 +33,7 @@ #include "kernels/a64_gemm_s16_12x8.hpp" #include "kernels/a64_gemm_s8_12x8.hpp" #include "kernels/a64_gemm_s8_4x4.hpp" +#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp" #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp" #include "kernels/sve_native_s8s32_dot_4VLx4.hpp" @@ -55,6 +57,13 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = { }, #endif { + GemmMethod::GEMM_HYBRID, + "hybrid_s8s32_dot_16x4", + [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; }, + [](const GemmArgs<int32_t> &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); } +}, +{ GemmMethod::GEMM_INTERLEAVED, "gemm_s8_12x8", [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod(); }, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index 436438f351..b83ccd3407 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -340,7 +340,7 @@ public: _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); // Now tune to presented problem size; this is how many blocks we need. - int num_k_blocks = iceildiv(_Ksize, _k_block); + unsigned int num_k_blocks = iceildiv(_Ksize, _k_block); // So divide the space equally into that many blocks. _k_block = iceildiv(_Ksize, num_k_blocks); @@ -363,7 +363,7 @@ public: _x_block = std::max(_x_block, 1U) * strategy::out_width(); // And tune to the presented problem size. 
- int num_x_blocks = iceildiv(_Nsize, _x_block); + unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); _x_block = iceildiv(_Nsize, num_x_blocks); _x_block = iceildiv(_x_block, strategy::out_width()); @@ -464,8 +464,8 @@ public: do { /* Figure out the size of each block. */ - size_t x_size = (current.xmax() - current.x0()); - size_t k_size = (current.kmax() - current.k0()); + unsigned int x_size = (current.xmax() - current.x0()); + unsigned int k_size = (current.kmax() - current.k0()); /* Round sizes up as needed. */ x_size = iceildiv(x_size, strategy::out_width()); @@ -480,6 +480,7 @@ public: return total; } + using GemmCommon<To, Tr>::pretranspose_B_array; void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { blockwalker current(*this); Toi *buffer = reinterpret_cast<Toi *>(in_buffer); @@ -488,8 +489,8 @@ public: do { /* Figure out the size of each block. */ - size_t x_size = (current.xmax() - current.x0()); - size_t k_size = (current.kmax() - current.k0()); + unsigned int x_size = (current.xmax() - current.x0()); + unsigned int k_size = (current.kmax() - current.k0()); /* Round sizes up as needed. */ x_size = iceildiv(x_size, strategy::out_width()); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp index 579533418d..98516b1ca6 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,8 +27,7 @@ #include "arm_gemm.hpp" -#include "mergeresults.hpp" -#include "transform.hpp" +#include "ndrange.hpp" #ifdef CYCLE_PROFILING #include "profiler.hpp" @@ -55,19 +54,25 @@ class GemmNative : public GemmCommon<To, Tr> { const unsigned int _nbatches; const unsigned int _nmultis; - Tr _beta; + const Tr _beta; const CPUInfo * const _ci; - unsigned int k_block=0; - unsigned int n_block=0; + const unsigned int _k_block; + const unsigned int _n_block; - unsigned int window_per_batch() const { - return iceildiv(_Msize, strategy::out_height()); + const NDRange<4> _window_range; + + static unsigned int compute_k_block(const GemmArgs<Tr> &args) { + return args._Ksize; } - unsigned int window_per_multi() const { - return window_per_batch() * _nbatches; + static unsigned int compute_n_block(const GemmArgs<Tr> &args) { + if ((args._cfg != nullptr) && args._cfg->outer_block_size > 0) { + return args._cfg->outer_block_size; + } else { + return args._Nsize; + } } public: @@ -75,15 +80,20 @@ public: GemmNative & operator= (GemmNative &) = delete; GemmNative(const GemmArgs<Tr> &args) - : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) { - /* For now don't do any blocking. TODO: figure out if we should. */ - k_block = _Ksize; - n_block = _Nsize; - } + : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _nbatches(args._nbatches), _nmultis(args._nmulti), + _beta(args._beta), _ci(args._ci), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args)), + _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { } // Window is amount per multi multiplied by total number of multis. 
unsigned int get_window_size() const override { - return window_per_multi() * _nmultis; + return _window_range.total_size(); + } + + // Native GEMMs can always be dynamically scheduled (whether requested or not) + bool supports_dynamic_scheduling() const override { + return true; } // Actually execute the GEMM. @@ -96,45 +106,30 @@ public: static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same."); static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same."); - /* Compute starting point based on 'start' */ - unsigned int multi = start / window_per_multi(); - unsigned int multi_pos = start % window_per_multi(); + auto p = _window_range.iterator(start, end); - unsigned int batch = multi_pos / window_per_batch(); - unsigned int batch_pos = multi_pos % window_per_batch(); - - unsigned int y0 = batch_pos * strategy::out_height(); - - for (unsigned int l=end-start; l>0; ) { - // Do work from here to the end of the current batch/multi - const unsigned int ymax = std::min(y0 + (l * strategy::out_height()), _Msize); + if (p.done()) { + return; + } - // Work out how many units this is and subtract from loop counter. 
- l -= ((ymax - y0) + (strategy::out_height() - 1)) / strategy::out_height(); + do { + unsigned int y0 = p.dim(0) * strategy::out_height(); + unsigned int ymax = std::min(p.dim0_max() * strategy::out_height(), _Msize); + unsigned int batch = p.dim(1); + unsigned int n0 = p.dim(2) * _n_block; + unsigned int nmax = std::min(n0 + _n_block, _Nsize); + unsigned int multi = p.dim(3); #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * _Nsize * _Ksize); + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * (nmax - n0) * _Ksize); #endif strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda, - this->_Bptr + (multi * this->_B_multi_stride), this->_ldb, - this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc), this->_ldc, - _beta, (ymax-y0), _Nsize, _Ksize); - - /* Advance to next item */ - y0 = ymax; - - /* Check for batch/multi overflow */ - if (y0 >= _Msize) { - y0=0; - batch++; - if (batch == _nbatches) { - batch=0; - multi++; - } - } - } + this->_Bptr + (multi * this->_B_multi_stride) + n0, this->_ldb, + this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc) + n0, this->_ldc, + _beta, (ymax-y0), (nmax - n0), _Ksize); + } while (p.next_dim1()); } }; -} // namespace arm_gemm +} // namespace arm_gemm
\ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index 3c8df3f044..b95ca8016b 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,11 +27,13 @@ #include "gemm_common.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" +#include "gemm_hybrid.hpp" #include "gemm_native.hpp" #include "kernels/a64_gemm_u16_12x8.hpp" #include "kernels/a64_gemm_u8_12x8.hpp" #include "kernels/a64_gemm_u8_4x4.hpp" +#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp" #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp" #include "kernels/sve_native_u8u32_dot_4VLx4.hpp" @@ -55,6 +57,13 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = { }, #endif { + GemmMethod::GEMM_HYBRID, + "hybrid_u8u32_dot_16x4", + [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; }, + [](const GemmArgs<uint32_t> &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); } +}, +{ GemmMethod::GEMM_INTERLEAVED, "gemm_u8_12x8", [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod(); }, diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp index 40f7f2b7cd..32d668f66d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp @@ -44,6 +44,7 @@ public: _subgemm = gemm<To,Tr>(newargs); } + using GemmCommon<To, Tr>::set_arrays; void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, const To 
*B, const int ldb, const int B_multi_stride, Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override { @@ -85,6 +86,7 @@ public: return _subgemm->get_B_pretransposed_array_size(); } + using GemmCommon<To, Tr>::pretranspose_B_array; void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override { _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride); } diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp index 5cf42761e6..5ebc6342d7 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp @@ -73,7 +73,7 @@ public: // Window is number of out_width blocks times number of multis. unsigned int get_window_size() const override { - return iceildiv(_Nsize, strategy::out_width) * _nmultis; + return iceildiv(_Nsize, strategy::out_width()) * _nmultis; } // Actually execute the GEMV. 
@@ -83,12 +83,12 @@ public: #endif strategy strat(_ci); - const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width); + const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width()); const unsigned int multi_0 = start / window_per_multi; const unsigned int multi_end = end / window_per_multi; - const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width; - const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width; + const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width(); + const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width(); static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same."); static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same."); diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index 842339ef23..f7beb0a34c 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -70,7 +70,7 @@ public: GemvPretransposed(const GemmArgs<Tr> &args) : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _trB(args._trB), _beta(args._beta), _ci(args._ci), - _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) { + _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) { /* For now don't do any blocking. TODO: figure out if we should. */ if (args._cfg && args._cfg->inner_block_size) { m_block = args._cfg->inner_block_size; @@ -87,7 +87,7 @@ public: // Window is number of out_width blocks, times number of multis. unsigned int get_window_size() const override { - return iceildiv(_Nsize, strategy::out_width) * _nmultis; + return iceildiv(_Nsize, strategy::out_width()) * _nmultis; } // Actually execute the GEMV. 
@@ -98,13 +98,13 @@ public: strategy strat(_ci); /* Break the window values down into multis of interest... */ - const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width); + const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width()); const unsigned int multi_0 = start / window_per_multi; const unsigned int multi_end = end / window_per_multi; /* ... and figure out where we start and end in the first and last multi. */ - const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width; - const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width; + const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width(); + const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width(); static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same."); @@ -124,8 +124,8 @@ public: auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n)); #endif /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */ - strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave), - (_Ksize * strategy::A_interleave), + strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()), + (_Ksize * strategy::A_interleave()), this->_Aptr + (multi * this->_A_multi_stride) + m0, this->_Cptr + (multi * this->_C_multi_stride) + n, _beta, (mmax-m0), (nmax-n)); @@ -148,6 +148,7 @@ public: return _buffer_per_multi * _nmultis * sizeof(To); } + using GemmCommon<To, Tr>::pretranspose_B_array; void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override { Toi *A_buffer = reinterpret_cast<Toi *>(buffer); @@ -155,10 +156,10 @@ public: /* Reverse sense here as we are dealing with B rather than A. 
So if * strategy::A_transpose is false and _trB is false, we still * transpose. */ - if (_trB ^ strategy::A_transpose) { - Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); + if (_trB ^ strategy::A_transpose()) { + Transform<strategy::A_interleave(), strategy::A_block(), false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); } else { - Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); + Transform<strategy::A_interleave(), strategy::A_block(), true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); } } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp index 06e62456dc..234972270c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,15 +50,15 @@ public: typedef void (*kern_type)(const float *, const float *, float *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 8; } - static int out_height() { + static unsigned int out_height() { return 6; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp index 95a2bc2fbc..2fcb587df1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. 
+ * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,15 +48,15 @@ public: typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp index fdc0200435..cc205dc6e3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,15 +43,15 @@ public: typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp index be7ead9f48..71c666ad00 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -42,15 +42,15 @@ public: typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 4; } - static int out_height() { + static unsigned int out_height() { return 4; } - static int k_unroll() { + static unsigned int k_unroll() { return 16; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp index d2692ba77f..3d5c92c622 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,15 +48,15 @@ public: typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp index a252abfd3e..9032ba67b3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -53,15 +53,15 @@ public: static const bool B_transpose = true; /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp index 2da3ecd4f8..fda7657b2b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,15 +50,15 @@ public: static const bool B_transpose = true; /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 4; } - static int out_height() { + static unsigned int out_height() { return 4; } - static int k_unroll() { + static unsigned int k_unroll() { return 16; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp index 911a4ebb01..5b850b7a20 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -47,15 +47,15 @@ public: typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 24; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp new file mode 100644 index 0000000000..c8934dff8a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include <cstdint> +#include "../std_transforms_fixed.hpp" + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int); +void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int); + +class hybrid_s8s32_dot_16x4 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_height() + { + return 4; + } + + static unsigned int out_width() + { + return 16; + } + + static unsigned int k_unroll() + { + return 4; + } + + StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_s8s32_dot_16x4; + + hybrid_s8s32_dot_16x4(const CPUInfo *ci) + { + if (ci->get_cpu_model() == CPUModel::A55r1) { + kernel = a64_hybrid_s8s32_dot_16x4_a55; + } + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp new file mode 100644 index 0000000000..48bf842ca5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp @@ -0,0 +1,2271 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include <algorithm> + +#include <cstdint> +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0); + const int K_stride = ((K + 3) / 4) * 4; + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + + for (int y=0; y<M; y+=4) { + const int8_t * const a_ptr0_base = A + (y * lda); + const unsigned long ldab = lda * sizeof(int8_t); + + int32_t *c_ptr0 = C + (y * ldc); + const unsigned long ldcb = ldc * sizeof(int32_t); + + for (int x0=0; x0<N; x0+=16ul) { + const long width = std::min((unsigned long)N-x0, 16ul); + const int32_t *betaptr = &beta; + long loops = loops_count; + long regs = regs_count; + const int8_t *a_ptr0 = a_ptr0_base; + const int8_t *b_ptr0 = B + (K_stride * x0); + + switch(M-y) { + case 1: + __asm __volatile ( + "temploadreg0 .req X0\n" + "temploadreg1 .req X1\n" + "temploadreg2 .req X2\n" + "temploadreg3 .req X3\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v18.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v19.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + 
"mul v19.4s, v19.4s, v15.4s\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "subs %[loops], %[loops], #0x1\n" + "ins v4.d[1], temploadreg0\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f80e952 // sdot 
v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "ldr d0, [%[a_ptr0], #-0x10]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" + "ldr d8, [%[b_ptr0]]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ins v0.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4fa4e1b1 // sdot v17.4s, 
v13.16b, v4.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" 
+ "ins v11.d[1], temploadreg3\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "b.ne 3b\n" + "2:\n" + "ins v14.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "ins v15.d[1], temploadreg3\n" + "cbz %[regs], 4f\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 
0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "ldr d0, [%[a_ptr0], #0x10]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x18]\n" + "ldr d8, [%[b_ptr0]]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ins v0.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" 
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + 
"ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + ".unreq temploadreg0\n" + ".unreq temploadreg1\n" + ".unreq temploadreg2\n" + ".unreq temploadreg3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "temploadreg0 .req X2\n" + "temploadreg1 .req X3\n" + "temploadreg2 .req X4\n" + "temploadreg3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, 
[%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v19.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v20.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v21.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v22.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v23.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 
0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "subs %[loops], %[loops], #0x1\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "ins v15.d[1], temploadreg3\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f80e952 
// sdot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + "ldr d0, [%[a_ptr0], 
#-0x10]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + "ldr d1, [a_ptr1, #-0x10]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ldr temploadreg1, [a_ptr1, #-0x8]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + "ins v1.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + 
"ins v13.d[1], temploadreg1\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + "ins v11.d[1], temploadreg3\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "b.ne 3b\n" + "2:\n" + "ins v14.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "ins v15.d[1], temploadreg3\n" + "cbz %[regs], 4f\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x4f81e135 // sdot 
v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + 
".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + "ldr d0, [%[a_ptr0], #0x10]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x18]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + "ldr d1, [a_ptr1, #0x10]\n" + "ins 
v13.d[1], temploadreg1\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ldr temploadreg1, [a_ptr1, #0x18]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + "ins v1.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e990 // sdot 
v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins 
v9.d[1], temploadreg1\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + ".unreq temploadreg0\n" + ".unreq temploadreg1\n" + ".unreq temploadreg2\n" + ".unreq temploadreg3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", 
"v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "temploadreg0 .req X4\n" + "temploadreg1 .req X5\n" + "temploadreg2 .req X6\n" + "temploadreg3 .req X7\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q2, [a_ptr2]\n" + "movi v19.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v20.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v21.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v22.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v23.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v24.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "movi v25.4s, #0\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "movi v26.4s, #0\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "movi v27.4s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ins v14.d[1], temploadreg2\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q24, [c_ptr2]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q25, 
[c_ptr2, #0x10]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q26, [c_ptr2, #0x20]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q27, [c_ptr2, #0x30]\n" + "mul v24.4s, v24.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v25.4s, v25.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v26.4s, v26.4s, v15.4s\n" + "ldr q2, [a_ptr2]\n" + "mul v27.4s, v27.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "ins v14.d[1], temploadreg2\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ins v5.d[1], temploadreg1\n" + 
".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ins 
v14.d[1], temploadreg2\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f86e139 // sdot 
v25.4s, v9.16b, v6.4b[0]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + "ldr d0, [%[a_ptr0], #-0x10]\n" + ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + "ldr d1, [a_ptr1, #-0x10]\n" + ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ldr temploadreg1, [a_ptr1, #-0x8]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ins v1.d[1], temploadreg1\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "ins v15.d[1], temploadreg3\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "ldr d2, [a_ptr2, #-0x10]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg2, [a_ptr2, #-0x8]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + "ins v2.d[1], 
temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, 
v5.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + "ins v11.d[1], temploadreg3\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "ins v14.d[1], temploadreg2\n" + "b.ne 3b\n" + "2:\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "ins v15.d[1], temploadreg3\n" + "cbz %[regs], 4f\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x4f82e17b 
// sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + 
"ins v15.d[1], temploadreg3\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" + "ldr temploadreg3, 
[%[b_ptr0], #-0x8]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr d0, [%[a_ptr0], #0x10]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x18]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr d1, [a_ptr1, #0x10]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x18]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ldr d2, [a_ptr2, #0x10]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" + "ldr temploadreg2, [a_ptr2, #0x18]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + "ins v1.d[1], temploadreg1\n" + ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v2.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f84e910 // sdot 
v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d5, 
[a_ptr1]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, 
v0.4b[2]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, 
[c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq temploadreg0\n" + ".unreq temploadreg1\n" + ".unreq temploadreg2\n" + ".unreq temploadreg3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "temploadreg0 .req X6\n" + "temploadreg1 .req X7\n" + "temploadreg2 .req X8\n" + "temploadreg3 .req X9\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q2, [a_ptr2]\n" + "movi v19.4s, #0\n" + "ldr q3, [a_ptr3]\n" + "movi v20.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v21.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v22.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v23.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v24.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v25.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "movi v26.4s, #0\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "movi v27.4s, #0\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "movi v28.4s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "movi v29.4s, #0\n" + "ins v14.d[1], 
temploadreg2\n" + "movi v30.4s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "movi v31.4s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add a_ptr3, a_ptr3, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q24, [c_ptr2]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q25, [c_ptr2, #0x10]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q26, [c_ptr2, #0x20]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q27, [c_ptr2, #0x30]\n" + "mul v24.4s, v24.4s, v15.4s\n" + "ldr q28, [c_ptr3]\n" + "mul v25.4s, v25.4s, v15.4s\n" + "ldr q29, [c_ptr3, #0x10]\n" + "mul v26.4s, v26.4s, v15.4s\n" + "ldr q30, [c_ptr3, #0x20]\n" + "mul v27.4s, v27.4s, v15.4s\n" + "ldr q31, [c_ptr3, #0x30]\n" + "mul v28.4s, v28.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v29.4s, v29.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v30.4s, v30.4s, v15.4s\n" + "ldr q2, [a_ptr2]\n" + "mul v31.4s, v31.4s, v15.4s\n" + "ldr q3, [a_ptr3]\n" + "ldr q8, [%[b_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "ins v14.d[1], temploadreg2\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f81e114 // 
sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d7, [a_ptr3]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg3, [a_ptr3, #0x8]\n" + ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ins v7.d[1], temploadreg3\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa3e1bd 
// sdot v29.4s, v13.16b, v3.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ldr 
d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + "add %[a_ptr0], %[a_ptr0], 
#0x20\n" + ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" + "ldr d0, [%[a_ptr0], #-0x10]\n" + ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" + "ldr d1, [a_ptr1, #-0x10]\n" + ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #-0x8]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" + "ins v1.d[1], temploadreg1\n" + ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" + "ldr d2, [a_ptr2, #-0x10]\n" + ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" + "ldr 
temploadreg2, [a_ptr2, #-0x8]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" + "ins v2.d[1], temploadreg2\n" + ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" + "ldr d3, [a_ptr3, #-0x10]\n" + ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" + "ldr temploadreg3, [a_ptr3, #-0x8]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + "ins v3.d[1], temploadreg3\n" + ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f85e977 // 
sdot v23.4s, v11.16b, v5.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" + "ins v14.d[1], temploadreg2\n" + "b.ne 3b\n" + "2:\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "ldr temploadreg3, 
[%[b_ptr0], #-0x8]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "prfm PSTL1KEEP, [c_ptr3]\n" + "ins v15.d[1], temploadreg3\n" + "cbz %[regs], 4f\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr d7, [a_ptr3]\n" + ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" + "ldr temploadreg3, [a_ptr3, #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ins v7.d[1], temploadreg3\n" + ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 
0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, 
v1.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[a_ptr0], #0x18]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr d0, [%[a_ptr0], #0x10]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr d1, [a_ptr1, #0x10]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "ldr temploadreg1, [a_ptr1, #0x18]\n" + ".word 
0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr d2, [a_ptr2, #0x10]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x18]\n" + ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" + "ldr d3, [a_ptr3, #0x10]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + "ldr temploadreg3, [a_ptr3, #0x18]\n" + ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" + "ins v1.d[1], temploadreg1\n" + ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + "ins v2.d[1], temploadreg2\n" + ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ins v3.d[1], temploadreg3\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg0, 
[%[b_ptr0], #0x48]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" + ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x4f87e97f // sdot v31.4s, 
v11.16b, v7.4b[2]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" + ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr d7, [a_ptr3]\n" + ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" + "ldr temploadreg3, [a_ptr3, #0x8]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" + "ldr d9, 
[%[b_ptr0], #0x10]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ins v7.d[1], temploadreg3\n" + ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 
0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add 
%[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + "str q28, [c_ptr3]\n" + "str q29, [c_ptr3, #0x10]\n" + "str q30, [c_ptr3, #0x20]\n" + "str q31, [c_ptr3, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq temploadreg0\n" + ".unreq temploadreg1\n" + ".unreq temploadreg2\n" + ".unreq temploadreg3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp new file mode 100644 index 0000000000..01791391c8 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp @@ -0,0 +1,1605 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include <algorithm> + +#include <cstdint> +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0); + const int K_stride = ((K + 3) / 4) * 4; + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + + for (int y=0; y<M; y+=4) { + const int8_t * const a_ptr0_base = A + (y * lda); + const unsigned long ldab = lda * sizeof(int8_t); + + int32_t *c_ptr0 = C + (y * ldc); + const unsigned long ldcb = ldc * sizeof(int32_t); + + for (int x0=0; x0<N; x0+=16ul) { + const long width = std::min((unsigned long)N-x0, 16ul); + const int32_t *betaptr = &beta; + long loops = loops_count; + long regs = regs_count; + const int8_t *a_ptr0 = a_ptr0_base; + const int8_t *b_ptr0 = B + (K_stride * x0); + + switch(M-y) { + case 1: + __asm __volatile ( + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v18.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v19.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q12, 
[%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4f84e173 // sdot v19.4s, 
v11.16b, v4.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + "b.ne 3b\n" + "2:\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "cbz %[regs], 4f\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, 
v14.16b, v0.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + "ldr q0, [%[a_ptr0], #0x10]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 
0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", 
"cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v19.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v20.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v21.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v22.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v23.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr 
q5, [a_ptr1]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 
0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + "ldr q9, [%[b_ptr0], 
#-0x70]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "cbz %[regs], 4f\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + 
".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #0x10]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q1, [a_ptr1, #0x10]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f84e173 // sdot v19.4s, 
v11.16b, v4.4b[0]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, 
v10.16b, v0.4b[0]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str 
q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q2, [a_ptr2]\n" + "movi v19.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v20.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v21.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v22.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v23.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v24.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "movi v25.4s, #0\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "movi v26.4s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "movi v27.4s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, 
[c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q24, [c_ptr2]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q25, [c_ptr2, #0x10]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q26, [c_ptr2, #0x20]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q27, [c_ptr2, #0x30]\n" + "mul v24.4s, v24.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v25.4s, v25.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v26.4s, v26.4s, v15.4s\n" + "ldr q2, [a_ptr2]\n" + "mul v27.4s, v27.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "add 
a_ptr2, a_ptr2, #0x20\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 
0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q2, [a_ptr2, #-0x10]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + ".word 0x4fa6e1da // 
sdot v26.4s, v14.16b, v6.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, 
[%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "cbz %[regs], 4f\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, 
v2.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #0x10]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q1, [a_ptr1, #0x10]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q2, [a_ptr2, #0x10]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f84e152 // sdot 
v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + 
".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + ".word 
0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + 
"str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q2, [a_ptr2]\n" + "movi v19.4s, #0\n" + "ldr q3, [a_ptr3]\n" + "movi v20.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v21.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v22.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v23.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v24.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v25.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "movi v26.4s, #0\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "movi v27.4s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "movi v28.4s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "movi v29.4s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "movi v30.4s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "movi v31.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, 
[%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q24, [c_ptr2]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q25, [c_ptr2, #0x10]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q26, [c_ptr2, #0x20]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q27, [c_ptr2, #0x30]\n" + "mul v24.4s, v24.4s, v15.4s\n" + "ldr q28, [c_ptr3]\n" + "mul v25.4s, v25.4s, v15.4s\n" + "ldr q29, [c_ptr3, #0x10]\n" + "mul v26.4s, v26.4s, v15.4s\n" + "ldr q30, [c_ptr3, #0x20]\n" + "mul v27.4s, v27.4s, v15.4s\n" + "ldr q31, [c_ptr3, #0x30]\n" + "mul v28.4s, v28.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v29.4s, v29.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v30.4s, v30.4s, v15.4s\n" + "ldr q2, [a_ptr2]\n" + "mul v31.4s, v31.4s, v15.4s\n" + "ldr q3, [a_ptr3]\n" + "ldr q8, [%[b_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q7, [a_ptr3]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 
0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" 
+ "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + 
".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "ldr q2, [a_ptr2, #-0x10]\n" + ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q3, [a_ptr3, #-0x10]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" + ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" + ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" + ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" + ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" + ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" + ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" + ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + ".word 0x4fa6e1da // sdot 
v26.4s, v14.16b, v6.4b[1]\n" + ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" + ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" + ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" + ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 
0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" + ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "prfm PSTL1KEEP, [c_ptr3]\n" + "cbz %[regs], 4f\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" + "ldr q7, [a_ptr3]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + ".word 
0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, 
v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #0x10]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q1, [a_ptr1, #0x10]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + "ldr q2, [a_ptr2, #0x10]\n" + ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q3, [a_ptr3, #0x10]\n" + ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" + ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" + ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" + ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" + ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" + ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x4f85e177 // sdot v23.4s, 
v11.16b, v5.4b[0]\n" + ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" + ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" + ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" + ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" + ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" + ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" + ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" + ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" + ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" + ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" + ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" + ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" + ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" + ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" + ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" + ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x4f85e977 // sdot v23.4s, v11.16b, 
v5.4b[2]\n" + ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" + ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" + ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" + ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" + ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" + ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" + "ldr q7, [a_ptr3]\n" + ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x4f81e177 
// sdot v23.4s, v11.16b, v1.4b[0]\n" + ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" + ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" + ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" + ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" + ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" + ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" + ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" + ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" + ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" + ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" + ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" + ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x4f81e977 // sdot 
v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" + ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" + ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" + ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" + ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" + ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" + ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + "str q28, [c_ptr3]\n" + "str q29, [c_ptr3, #0x10]\n" + "str q30, [c_ptr3, #0x20]\n" + "str q31, [c_ptr3, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp new file mode 100644 index 0000000000..7fb9b5c131 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations. Argument order, as named in a64_hybrid_u8u32_dot_16x4/a55.cpp: A, lda, B, C, ldc, beta, M, N, K.
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+class hybrid_u8u32_dot_16x4 // Describes the unsigned 8-bit -> 32-bit dot-product hybrid kernel: its element types, blocking sizes and the kernel function to call.
+{
+public:
+    typedef uint8_t operand_type; // input element type, matching the kernels' A and B pointers
+    typedef uint32_t result_type; // output element type, matching the kernels' C pointer and beta
+
+    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int); // same signature as both kernel variants declared above
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height() // rows handled per step: the A55 kernel advances y by 4 and switches on M-y for the remainder
+    {
+        return 4;
+    }
+
+    static unsigned int out_width() // columns handled per step: the A55 kernel advances x0 by 16 and clamps width to min(N-x0, 16)
+    {
+        return 16;
+    }
+
+    static unsigned int k_unroll() // K granularity: the A55 kernel rounds K up to a multiple of 4 when computing its B stride (K_stride)
+    {
+        return 4;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {}; // NOTE(review): 4, 16, 4 repeat the blocking values above, presumably height/width/k-unroll in that order; confirm against std_transforms_fixed.hpp
+
+    // Default to the generic kernel; the constructor below replaces it when the CPU model is A55r1
+    kern_type kernel=a64_hybrid_u8u32_dot_16x4;
+
+    hybrid_u8u32_dot_16x4(const CPUInfo *ci) // ci is dereferenced unconditionally, so callers must pass a non-null CPUInfo
+    {
+        if (ci->get_cpu_model() == CPUModel::A55r1) {
+            kernel = a64_hybrid_u8u32_dot_16x4_a55;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
new file mode 100644
index 0000000000..230ecdce2d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include <algorithm> + +#include <cstdint> +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0u); + const int K_stride = ((K + 3) / 4) * 4; + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + + for (int y=0; y<M; y+=4) { + const uint8_t * const a_ptr0_base = A + (y * lda); + const unsigned long ldab = lda * sizeof(uint8_t); + + uint32_t *c_ptr0 = C + (y * ldc); + const unsigned long ldcb = ldc * sizeof(uint32_t); + + for (int x0=0; x0<N; x0+=16ul) { + const long width = std::min((unsigned long)N-x0, 16ul); + const uint32_t *betaptr = &beta; + long loops = loops_count; + long regs = regs_count; + const uint8_t *a_ptr0 = a_ptr0_base; + const uint8_t *b_ptr0 = B + (K_stride * x0); + + switch(M-y) { + case 1: + __asm __volatile ( + "temploadreg0 .req X0\n" + "temploadreg1 .req X1\n" + "temploadreg2 .req X2\n" + "temploadreg3 .req X3\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v18.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v19.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q9, [%[b_ptr0], 
#0x10]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "subs %[loops], %[loops], #0x1\n" + "ins v4.d[1], temploadreg0\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 
0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "ldr d0, [%[a_ptr0], #-0x10]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" + "ldr d8, [%[b_ptr0]]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ins v0.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6fa4e1b1 // 
udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + "ldr temploadreg2, 
[%[b_ptr0], #-0x18]\n" + "ins v11.d[1], temploadreg3\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "b.ne 3b\n" + "2:\n" + "ins v14.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "ins v15.d[1], temploadreg3\n" + "cbz %[regs], 4f\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + "ldr temploadreg0, [%[b_ptr0], 
#-0x78]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "ldr d0, [%[a_ptr0], #0x10]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x18]\n" + "ldr d8, [%[b_ptr0]]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ins v0.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + "ins v13.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins 
v8.d[1], temploadreg0\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, 
[%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + ".unreq temploadreg0\n" + ".unreq temploadreg1\n" + ".unreq temploadreg2\n" + ".unreq temploadreg3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "temploadreg0 .req X2\n" + "temploadreg1 .req X3\n" + "temploadreg2 .req X4\n" + "temploadreg3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi 
v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v19.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v20.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v21.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v22.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v23.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr d4, 
[%[a_ptr0]]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "subs %[loops], %[loops], #0x1\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "ins v15.d[1], temploadreg3\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], 
temploadreg3\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, 
v5.4b[0]\n" + "ldr d0, [%[a_ptr0], #-0x10]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + "ldr d1, [a_ptr1, #-0x10]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ldr temploadreg1, [a_ptr1, #-0x8]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + "ins v1.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" 
+ "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + "ins v11.d[1], temploadreg3\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "b.ne 3b\n" + "2:\n" + "ins v14.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "ins v15.d[1], temploadreg3\n" + "cbz %[regs], 4f\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d5, 
[a_ptr1]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + 
"add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + "ldr d0, [%[a_ptr0], #0x10]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x18]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + "ldr 
d1, [a_ptr1, #0x10]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ldr temploadreg1, [a_ptr1, #0x18]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + "ins v1.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v14.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + "ins v15.d[1], 
temploadreg3\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + "ldr temploadreg0, [%[b_ptr0], 
#0x48]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + ".unreq temploadreg0\n" + ".unreq temploadreg1\n" + ".unreq temploadreg2\n" + ".unreq temploadreg3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" 
(beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "temploadreg0 .req X4\n" + "temploadreg1 .req X5\n" + "temploadreg2 .req X6\n" + "temploadreg3 .req X7\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q2, [a_ptr2]\n" + "movi v19.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v20.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v21.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v22.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v23.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v24.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "movi v25.4s, #0\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "movi v26.4s, #0\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "movi v27.4s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ins v14.d[1], temploadreg2\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q24, 
[c_ptr2]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q25, [c_ptr2, #0x10]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q26, [c_ptr2, #0x20]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q27, [c_ptr2, #0x30]\n" + "mul v24.4s, v24.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v25.4s, v25.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v26.4s, v26.4s, v15.4s\n" + "ldr q2, [a_ptr2]\n" + "mul v27.4s, v27.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "ins v14.d[1], temploadreg2\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa0e190 // udot 
v16.4s, v12.16b, v0.4b[1]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + 
".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + 
"ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + "ldr d0, [%[a_ptr0], #-0x10]\n" + ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + "ldr d1, [a_ptr1, #-0x10]\n" + ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ldr temploadreg1, [a_ptr1, #-0x8]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ins v1.d[1], temploadreg1\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "ins v15.d[1], temploadreg3\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "ldr d2, [a_ptr2, #-0x10]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg2, [a_ptr2, #-0x8]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" + "ldr d12, [%[b_ptr0], 
#0x40]\n" + "ins v8.d[1], temploadreg0\n" + "ins v2.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + "ldr d11, [%[b_ptr0], 
#-0x50]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + "ins v9.d[1], temploadreg1\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + "ins v10.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + "ins v11.d[1], temploadreg3\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "ins v14.d[1], temploadreg2\n" + "b.ne 3b\n" + "2:\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "ins v15.d[1], temploadreg3\n" + "cbz %[regs], 4f\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, 
v1.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ins v14.d[1], 
temploadreg2\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + "ins v11.d[1], temploadreg3\n" + 
".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr d0, [%[a_ptr0], #0x10]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x18]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr d1, [a_ptr1, #0x10]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x18]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ldr d2, [a_ptr2, #0x10]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" + "ldr temploadreg2, [a_ptr2, #0x18]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + "ins v1.d[1], temploadreg1\n" + ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + "ins v2.d[1], temploadreg2\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + "ins v15.d[1], temploadreg3\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa6e1fb // udot v27.4s, v15.16b, 
v6.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], 
#0x8]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + 
"ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, 
#0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq temploadreg0\n" + ".unreq temploadreg1\n" + ".unreq temploadreg2\n" + ".unreq temploadreg3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "temploadreg0 .req X6\n" + "temploadreg1 .req X7\n" + "temploadreg2 .req X8\n" + "temploadreg3 .req X9\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q2, [a_ptr2]\n" + "movi v19.4s, #0\n" + "ldr q3, [a_ptr3]\n" + "movi v20.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v21.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v22.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v23.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v24.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v25.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "movi v26.4s, #0\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "movi v27.4s, #0\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "movi v28.4s, #0\n" + "add 
%[a_ptr0], %[a_ptr0], #0x10\n" + "movi v29.4s, #0\n" + "ins v14.d[1], temploadreg2\n" + "movi v30.4s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "movi v31.4s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add a_ptr3, a_ptr3, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q24, [c_ptr2]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q25, [c_ptr2, #0x10]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q26, [c_ptr2, #0x20]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q27, [c_ptr2, #0x30]\n" + "mul v24.4s, v24.4s, v15.4s\n" + "ldr q28, [c_ptr3]\n" + "mul v25.4s, v25.4s, v15.4s\n" + "ldr q29, [c_ptr3, #0x10]\n" + "mul v26.4s, v26.4s, v15.4s\n" + "ldr q30, [c_ptr3, #0x20]\n" + "mul v27.4s, v27.4s, v15.4s\n" + "ldr q31, [c_ptr3, #0x30]\n" + "mul v28.4s, v28.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v29.4s, v29.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v30.4s, v30.4s, v15.4s\n" + "ldr q2, [a_ptr2]\n" + "mul v31.4s, v31.4s, v15.4s\n" + "ldr q3, [a_ptr3]\n" + "ldr q8, [%[b_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "ins v14.d[1], temploadreg2\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, 
v0.4b[0]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d7, [a_ptr3]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg3, [a_ptr3, #0x8]\n" + ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ins v7.d[1], temploadreg3\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, 
v2.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], 
#-0x78]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ins v14.d[1], temploadreg2\n" + ".word 
0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" + "ldr d0, [%[a_ptr0], #-0x10]\n" + ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" + "ldr d1, [a_ptr1, #-0x10]\n" + ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #-0x8]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" + "ins v1.d[1], temploadreg1\n" + ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" + "ldr d2, 
[a_ptr2, #-0x10]\n" + ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" + "ldr temploadreg2, [a_ptr2, #-0x8]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" + "ins v2.d[1], temploadreg2\n" + ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" + "ldr d3, [a_ptr3, #-0x10]\n" + ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" + "ldr temploadreg3, [a_ptr3, #-0x8]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" + "ins v3.d[1], temploadreg3\n" + ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f84e973 // udot 
v19.4s, v11.16b, v4.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" + "ins v12.d[1], temploadreg0\n" + "ins v13.d[1], temploadreg1\n" + "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" + "ins v14.d[1], temploadreg2\n" + "b.ne 3b\n" + "2:\n" + "ldr d15, 
[%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "prfm PSTL1KEEP, [c_ptr3]\n" + "ins v15.d[1], temploadreg3\n" + "cbz %[regs], 4f\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr d7, [a_ptr3]\n" + ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" + "ldr temploadreg3, [a_ptr3, #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ins v7.d[1], temploadreg3\n" + ".word 0x6fa3e19c // udot v28.4s, 
v12.16b, v3.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr d8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" + ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" + "ldr d9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr temploadreg1, 
[%[b_ptr0], #-0x68]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" + ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" + "ldr d10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" + "ldr d11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" + ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" + "ldr d12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" + "ldr d13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr temploadreg0, [%[a_ptr0], #0x18]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" + "ldr d14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr d0, [%[a_ptr0], #0x10]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr d1, [a_ptr1, #0x10]\n" + ".word 0x6fa2e9fb // udot v27.4s, 
v15.16b, v2.4b[3]\n" + "ldr temploadreg1, [a_ptr1, #0x18]\n" + ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" + "ldr d15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr d2, [a_ptr2, #0x10]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x18]\n" + ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" + "ldr d3, [a_ptr3, #0x10]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + "ldr temploadreg3, [a_ptr3, #0x18]\n" + ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" + "ins v0.d[1], temploadreg0\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" + "ins v1.d[1], temploadreg1\n" + ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + "ins v2.d[1], temploadreg2\n" + ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ins v3.d[1], temploadreg3\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 
0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" + ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" + ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x6f86e97b // udot 
v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" + ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr d4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr temploadreg0, [%[a_ptr0], #0x8]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr d5, [a_ptr1]\n" + ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" + "ldr temploadreg1, [a_ptr1, #0x8]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr d6, [a_ptr2]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr temploadreg2, [a_ptr2, #0x8]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr d7, [a_ptr3]\n" + ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" + "ldr temploadreg3, [a_ptr3, #0x8]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr d8, [%[b_ptr0]]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ins v4.d[1], temploadreg0\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".word 
0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" + "ldr d9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ins v5.d[1], temploadreg1\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr d10, [%[b_ptr0], #0x20]\n" + ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" + "ins v6.d[1], temploadreg2\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr d11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ins v7.d[1], temploadreg3\n" + ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr d12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ins v8.d[1], temploadreg0\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" + "ldr d13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ins v9.d[1], temploadreg1\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ins v10.d[1], temploadreg2\n" + ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" + "ldr d14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ins v11.d[1], temploadreg3\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" + ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" + "ldr d15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, 
v0.4b[2]\n" + "ins v12.d[1], temploadreg0\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ins v13.d[1], temploadreg1\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + "ins v14.d[1], temploadreg2\n" + ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" + "ins v15.d[1], temploadreg3\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], 
#0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + "str q28, [c_ptr3]\n" + "str q29, [c_ptr3, #0x10]\n" + "str q30, [c_ptr3, #0x20]\n" + "str q31, [c_ptr3, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq temploadreg0\n" + ".unreq temploadreg1\n" + ".unreq temploadreg2\n" + ".unreq temploadreg3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp new file mode 100644 index 0000000000..dbef02985f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp @@ -0,0 +1,1605 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include <algorithm> + +#include <cstdint> +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0u); + const int K_stride = ((K + 3) / 4) * 4; + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + + for (int y=0; y<M; y+=4) { + const uint8_t * const a_ptr0_base = A + (y * lda); + const unsigned long ldab = lda * sizeof(uint8_t); + + uint32_t *c_ptr0 = C + (y * ldc); + const unsigned long ldcb = ldc * sizeof(uint32_t); + + for (int x0=0; x0<N; x0+=16ul) { + const long width = std::min((unsigned long)N-x0, 16ul); + const uint32_t *betaptr = &beta; + long loops = loops_count; + long regs = regs_count; + const uint8_t *a_ptr0 = a_ptr0_base; + const uint8_t *b_ptr0 = B + (K_stride * x0); + + switch(M-y) { + case 1: + __asm __volatile ( + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v18.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v19.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr 
q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6f84e173 // udot v19.4s, 
v11.16b, v4.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + "b.ne 3b\n" + "2:\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "cbz %[regs], 4f\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1d2 // udot v18.4s, 
v14.16b, v0.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + "ldr q0, [%[a_ptr0], #0x10]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 
0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", 
"cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v19.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v20.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v21.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v22.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v23.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr 
q5, [a_ptr1]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 
0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + "ldr q9, [%[b_ptr0], 
#-0x70]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "cbz %[regs], 4f\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + 
".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #0x10]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q1, [a_ptr1, #0x10]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f84e173 // udot v19.4s, 
v11.16b, v4.4b[0]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e152 // udot v18.4s, 
v10.16b, v0.4b[0]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str 
q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q2, [a_ptr2]\n" + "movi v19.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v20.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v21.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v22.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v23.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v24.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "movi v25.4s, #0\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "movi v26.4s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "movi v27.4s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, 
[c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q24, [c_ptr2]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q25, [c_ptr2, #0x10]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q26, [c_ptr2, #0x20]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q27, [c_ptr2, #0x30]\n" + "mul v24.4s, v24.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v25.4s, v25.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v26.4s, v26.4s, v15.4s\n" + "ldr q2, [a_ptr2]\n" + "mul v27.4s, v27.4s, v15.4s\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "add 
a_ptr2, a_ptr2, #0x20\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 
0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q2, [a_ptr2, #-0x10]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + ".word 0x6fa6e1da // 
udot v26.4s, v14.16b, v6.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, 
[%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "cbz %[regs], 4f\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, 
v2.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #0x10]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q1, [a_ptr1, #0x10]\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q2, [a_ptr2, #0x10]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f84e152 // udot 
v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + 
".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + ".word 
0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + 
"str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[beta0], 1f\n" + "movi v16.4s, #0\n" + "ldr q0, [%[a_ptr0]]\n" + "movi v17.4s, #0\n" + "ldr q1, [a_ptr1]\n" + "movi v18.4s, #0\n" + "ldr q2, [a_ptr2]\n" + "movi v19.4s, #0\n" + "ldr q3, [a_ptr3]\n" + "movi v20.4s, #0\n" + "ldr q8, [%[b_ptr0]]\n" + "movi v21.4s, #0\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "movi v22.4s, #0\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "movi v23.4s, #0\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "movi v24.4s, #0\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "movi v25.4s, #0\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "movi v26.4s, #0\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "movi v27.4s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "movi v28.4s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "movi v29.4s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "movi v30.4s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "movi v31.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1r {v15.4s}, [%[betaptr]]\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, 
[%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "mul v16.4s, v16.4s, v15.4s\n" + "ldr q20, [c_ptr1]\n" + "mul v17.4s, v17.4s, v15.4s\n" + "ldr q21, [c_ptr1, #0x10]\n" + "mul v18.4s, v18.4s, v15.4s\n" + "ldr q22, [c_ptr1, #0x20]\n" + "mul v19.4s, v19.4s, v15.4s\n" + "ldr q23, [c_ptr1, #0x30]\n" + "mul v20.4s, v20.4s, v15.4s\n" + "ldr q24, [c_ptr2]\n" + "mul v21.4s, v21.4s, v15.4s\n" + "ldr q25, [c_ptr2, #0x10]\n" + "mul v22.4s, v22.4s, v15.4s\n" + "ldr q26, [c_ptr2, #0x20]\n" + "mul v23.4s, v23.4s, v15.4s\n" + "ldr q27, [c_ptr2, #0x30]\n" + "mul v24.4s, v24.4s, v15.4s\n" + "ldr q28, [c_ptr3]\n" + "mul v25.4s, v25.4s, v15.4s\n" + "ldr q29, [c_ptr3, #0x10]\n" + "mul v26.4s, v26.4s, v15.4s\n" + "ldr q30, [c_ptr3, #0x20]\n" + "mul v27.4s, v27.4s, v15.4s\n" + "ldr q31, [c_ptr3, #0x30]\n" + "mul v28.4s, v28.4s, v15.4s\n" + "ldr q0, [%[a_ptr0]]\n" + "mul v29.4s, v29.4s, v15.4s\n" + "ldr q1, [a_ptr1]\n" + "mul v30.4s, v30.4s, v15.4s\n" + "ldr q2, [a_ptr2]\n" + "mul v31.4s, v31.4s, v15.4s\n" + "ldr q3, [a_ptr3]\n" + "ldr q8, [%[b_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 2f\n" + "3:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q7, [a_ptr3]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 
0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" 
+ "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + 
".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "ldr q2, [a_ptr2, #-0x10]\n" + ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q3, [a_ptr3, #-0x10]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" + ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" + ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" + ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" + ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" + ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" + ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" + ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + ".word 0x6fa6e1da // udot 
v26.4s, v14.16b, v6.4b[1]\n" + ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" + ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" + ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" + ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" + ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" + ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 
0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" + ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "prfm PSTL1KEEP, [c_ptr3]\n" + "cbz %[regs], 4f\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" + "ldr q7, [a_ptr3]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + ".word 
0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, 
v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" + "ldr q12, [%[b_ptr0], #-0x40]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" + "ldr q13, [%[b_ptr0], #-0x30]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" + "ldr q14, [%[b_ptr0], #-0x20]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + "ldr q0, [%[a_ptr0], #0x10]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + "ldr q1, [a_ptr1, #0x10]\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + "ldr q2, [a_ptr2, #0x10]\n" + ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" + "ldr q15, [%[b_ptr0], #-0x10]\n" + ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" + "ldr q3, [a_ptr3, #0x10]\n" + ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" + ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" + ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" + ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" + ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" + ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" + ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" + ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" + ".word 0x6f85e177 // udot v23.4s, 
v11.16b, v5.4b[0]\n" + ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" + ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" + ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" + ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" + ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" + ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" + ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" + ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" + ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" + ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" + ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" + ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" + ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" + ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" + ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" + ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" + ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" + ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" + ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" + ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" + ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" + ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" + ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" + ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" + ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" + ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" + ".word 0x6f85e977 // udot v23.4s, v11.16b, 
v5.4b[2]\n" + ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" + ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" + ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" + ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" + ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" + ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" + ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" + ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" + ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" + ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" + ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" + ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" + ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" + ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" + ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" + ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" + ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" + ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" + "b 5f\n" + "4:\n" + ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q4, [%[a_ptr0]]\n" + ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q5, [a_ptr1]\n" + ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q6, [a_ptr2]\n" + ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" + "ldr q7, [a_ptr3]\n" + ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q8, [%[b_ptr0]]\n" + ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" + ".word 0x6f81e177 
// udot v23.4s, v11.16b, v1.4b[0]\n" + ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" + ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" + ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" + ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" + ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" + "ldr q12, [%[b_ptr0], #0x40]\n" + ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" + ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" + ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" + ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" + "ldr q13, [%[b_ptr0], #0x50]\n" + ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" + ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" + ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" + ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" + "ldr q14, [%[b_ptr0], #0x60]\n" + ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" + ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" + ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" + ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" + "ldr q15, [%[b_ptr0], #0x70]\n" + ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" + ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" + ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" + ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" + ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" + ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" + ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" + ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" + ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" + ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" + ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" + ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" + ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" + ".word 0x6f81e977 // udot 
v23.4s, v11.16b, v1.4b[2]\n" + ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" + ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" + ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" + ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" + ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" + ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" + ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" + ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" + ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" + ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" + ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" + ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" + ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" + ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" + ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" + ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" + ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" + ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" + "5:\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + "str q28, [c_ptr3]\n" + "str q29, [c_ptr3, #0x10]\n" + "str q30, [c_ptr3, #0x20]\n" + "str q31, [c_ptr3, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", 
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp index 10d1069417..3c0395a337 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -51,15 +51,15 @@ public: typedef void (*kern_type)(const float *, const float *, float *, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 12; } - static int out_height() { + static unsigned int out_height() { return 8; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp index 0c387ff6df..95e3712e84 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -51,15 +51,15 @@ public: static const bool B_transpose = true; /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 16; } - static int out_height() { + static unsigned int out_height() { return 4; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp index 1a3596511b..3d2b324314 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -46,15 +46,15 @@ public: typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int); /* Kernel blocking parameters */ - static int out_width() { + static unsigned int out_width() { return 16; } - static int out_height() { + static unsigned int out_height() { return 4; } - static int k_unroll() { + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp index a73bc76b5d..f5b4f4aa19 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -46,13 +46,26 @@ public: * terms of this standard arrangement, so if the A matrix is in fact the * B matrix from a GEMM call, the sense of the transpose needs to be * reversed. 
*/ - static const int A_interleave = 32; - static const int A_block = 1; - static const bool A_transpose = false; + static constexpr unsigned int A_interleave() { + return 32; + } + + static constexpr unsigned int A_block() { + return 1; + } + + static constexpr bool A_transpose() { + return false; + } /* Kernel blocking parameters */ - static const int out_width = 32; - static const int k_unroll = 1; + static constexpr unsigned int out_width() { + return 32; + } + + static constexpr unsigned int k_unroll() { + return 1; + } kern_type kernel = a64_sgemv_pretransposed; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp index 18c5c3a6dc..cbaa0cfb1b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,8 +39,13 @@ public: typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int); /* Kernel blocking parameters */ - static const int out_width = 96; - static const int k_unroll = 1; + static unsigned int out_width() { + return 96; + } + + static unsigned int k_unroll() { + return 1; + } kern_type kernel=a64_sgemv_trans; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp index 2b58b110c0..76f452d963 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,17 +43,17 @@ public: typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length<float>() * 4; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp index 9d88b60cee..2ca4ce25e8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp @@ -41,17 +41,17 @@ public: typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); /* Kernel blocking parameters */ - static int out_width() + static unsigned int out_width() { return get_vector_length<__fp16>() * 3; } - static int out_height() + static unsigned int out_height() { return 8; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp index 2e8f261fe1..8c1fe6d0b6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp @@ -41,17 +41,17 @@ public: typedef void (*kern_type)(const float *, const float *, float *, int, int, int); /* Kernel blocking parameters */ - static int out_width() + static unsigned int out_width() { return get_vector_length<float>() * 3; } - static int out_height() + static unsigned int out_height() { return 8; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp index 67154e6a3f..cbb21387b1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp @@ -41,17 +41,17 @@ public: typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() + static unsigned int out_width() { return get_vector_length<int32_t>() * 3; } - static int out_height() + static unsigned int out_height() { return 8; } - static int k_unroll() + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp index 628c5a868e..99c039e121 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp @@ -41,17 +41,17 @@ public: typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); /* Kernel blocking parameters */ - static int out_width() + static unsigned int out_width() { return get_vector_length<uint32_t>() * 3; } - static int out_height() + static unsigned int out_height() { return 8; } - static int k_unroll() + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp index fcc80d9fe5..d7f9f20074 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -42,17 +42,17 @@ public: typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length<float>() * 4; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp index f5634e3618..8b98358cd4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,17 +42,17 @@ public: typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length<int32_t>() * 4; } - static int k_unroll() + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp index f5ebad8565..bcbd3d35f5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,17 +43,17 @@ public: typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length<uint32_t>() * 4; } - static int k_unroll() + static unsigned int k_unroll() { return 4; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp index 80b216ca14..06622d6f2e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,17 +42,17 @@ public: typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length<float>() * 1; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp index aa2c522382..022efdfc26 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -42,17 +42,17 @@ public: typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int); /* Kernel blocking parameters */ - static int out_height() + static unsigned int out_height() { return 4; } - static int out_width() + static unsigned int out_width() { return get_vector_length<float>() * 1; } - static int k_unroll() + static unsigned int k_unroll() { return 1; } diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp new file mode 100644 index 0000000000..20824dfc8b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#include <algorithm> +#include <initializer_list> + +namespace arm_gemm { + +template<unsigned int D> +class NDRange { +private: + unsigned int m_sizes[D]; + unsigned int m_totalsizes[D]; + + class NDRangeIterator { + private: + const NDRange &m_parent; + unsigned int m_pos = 0; + unsigned int m_end = 0; + + public: + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { } + + bool done() const { + return (m_pos >= m_end); + } + + unsigned int dim(unsigned int d) const { + unsigned int r = m_pos; + + if (d < (D - 1)) { + r %= m_parent.m_totalsizes[d]; + } + + if (d > 0) { + r /= m_parent.m_totalsizes[d-1]; + } + + return r; + } + + bool next_dim0() { + m_pos++; + + return !done(); + } + + bool next_dim1() { + m_pos += m_parent.m_sizes[0] - dim(0); + + return !done(); + } + + unsigned int dim0_max() const { + unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0)); + + return dim(0) + offset; + } + }; + +public: + template <typename... T> + NDRange(T... ts) : m_sizes{ts...} { + unsigned int t=1; + + for (unsigned int i=0; i<D; i++) { + t *= m_sizes[i]; + + m_totalsizes[i] = t; + } + } + + NDRangeIterator iterator(unsigned int start, unsigned int end) const { + return NDRangeIterator(*this, start, end); + } + + unsigned int total_size() const { + return m_totalsizes[D - 1]; + } + + unsigned int get_size(unsigned int v) const { + return m_sizes[v]; + } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp index 8b96c328a6..f0707800cf 100644 --- a/src/core/NEON/kernels/arm_gemm/utils.hpp +++ b/src/core/NEON/kernels/arm_gemm/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,7 +32,8 @@ // Paranoid option for the above with assert // #define UNREACHABLE(why) assert(0 && why) -inline int iceildiv(const int a, const int b) { +template<typename T> +inline T iceildiv(const T a, const T b) { return (a + b - 1) / b; } |