author     Georgios Pinitas <georgios.pinitas@arm.com>  2019-01-09 18:35:17 +0000
committer  Georgios Pinitas <georgios.pinitas@arm.com>  2019-01-18 13:41:40 +0000
commit     7cd26d4a1b14bc4bf7c61496803416ab3d84791f (patch)
tree       12cc4a27d7ecebc69a43e96b1f46c7eb05437978
parent     3ac2f3a1d9297220d1b0ce920dd13fdd4edcc187 (diff)
download   ComputeLibrary-7cd26d4a1b14bc4bf7c61496803416ab3d84791f.tar.gz
COMPMID-1867: Add NEON/SVE GEMM Hybrid kernels.
Change-Id: Ib40a9921e7f9a6a8be6c38872d6b3a0f24ed0cd3
Reviewed-on: https://review.mlplatform.org/515
Reviewed-by: Anthony Barbier <Anthony.barbier@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/Helpers.h | 41
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h | 130
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h | 140
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h | 71
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp | 88
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_common.hpp | 11
-rw-r--r--  arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h | 17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 92
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 211
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 303
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp | 153
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int16.cpp | 39
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 90
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 71
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_native.hpp | 21
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp | 41
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 86
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_batched.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp | 9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 21
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp | 78
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp | 970
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp | 2005
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp | 48
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp | 46
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp | 46
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp | 73
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp | 2066
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp | 73
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp | 4632
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp | 74
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp | 4632
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp | 73
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp | 4264
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp | 73
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp | 4004
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp | 1660
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transform.hpp | 13
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp | 6
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/list.hpp | 15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp | 470
-rw-r--r--  src/core/NEON/kernels/arm_gemm/utils.hpp | 44
-rw-r--r--  src/core/NEON/kernels/assembly/Helpers.cpp | 100
-rw-r--r--  src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp | 152
-rw-r--r--  src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp | 189
-rw-r--r--  src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h | 239
-rw-r--r--  src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp | 118
-rw-r--r--  src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp | 6
-rw-r--r--  src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp | 107
-rw-r--r--  src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp | 142
56 files changed, 26516 insertions, 1637 deletions
diff --git a/arm_compute/core/NEON/kernels/assembly/Helpers.h b/arm_compute/core/NEON/kernels/assembly/Helpers.h
index 11c4c08086..e2a46e96a3 100644
--- a/arm_compute/core/NEON/kernels/assembly/Helpers.h
+++ b/arm_compute/core/NEON/kernels/assembly/Helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,9 @@
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
+
namespace arm_compute
{
/** Block sizes to use to break the M, N, K dimension */
@@ -38,31 +41,29 @@ struct BlockSizes
unsigned int strategy_out_height{ 0 }; /**< Number of rows (M) processed by the selected strategy */
};
-/** Calculate the recommended block sizes to use based on the CPU cache sizes and data type
- *
- * @param[in] ci CPU information
- * @param[in] M M dimension.
- * @param[in] N N dimension.
- * @param[in] K K dimension.
- * @param[in] input_type Input data type
- * @param[in] use_dot (Optional) If data_type is QASYMM8/U8/S8, then use the dot product instruction ?
- *
- * @return Recommeded block sizes to use for the given M, N, K dimensions.
- */
-BlockSizes calculate_block_sizes_from_data_type(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K, DataType input_type, bool use_dot = false);
-
-/** Get the name of the GEMM strategy which will be used for a given input type
+/** Extracts the description of the kernel selected by the GEMM backend heuristics
*
- * @param[in] input_type Input data type
- * @param[in] use_dot (Optional) If data_type is QASYMM8/U8/S8, then use the dot product instruction ?
+ * @param[in] input_type Data type of the input tensor.
+ * @param[in] ci CPU information.
+ * @param[in] num_threads Maximum number of threads that might be used for the calculations.
+ * @param[in] p M, N, K sizes.
+ * @param[in] alpha Alpha value.
+ * @param[in] beta Beta value.
+ * @param[in] pretranspose_hint Is B also pretransposed?
*
- * @return The name of the strategy that will be used
+ * @return Kernel description that the assembly heuristics picked for the given configuration
*/
-const char *get_strategy_name(DataType input_type, bool use_dot = false);
+arm_gemm::KernelDescription get_gemm_info(DataType input_type,
+ const CPUInfo &ci,
+ const unsigned int num_threads,
+ const INEGEMMWrapperKernel::Params &p,
+ float alpha,
+ float beta,
+ bool pretranspose_hint);
/** Calculate the recommended block sizes to use based on the CPU cache sizes and the strategy which will be used
*
- * @param[in] ci CPU information
+ * @param[in] ci CPU information.
* @param[in] M M dimension.
* @param[in] N N dimension.
* @param[in] K K dimension.
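
Usage sketch for the new heuristics query declared above. This is illustrative, not part of the patch: the shape, thread count and alpha/beta values are made up, and the Params aggregate is assumed to expose public M/N/K fields as its documentation ("M, N, K sizes") suggests.

#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"

arm_gemm::KernelDescription pick_kernel(const arm_compute::CPUInfo &ci)
{
    arm_compute::INEGEMMWrapperKernel::Params p;
    p.M = 1024; // rows of the output      (illustrative)
    p.N = 1024; // columns of the output   (illustrative)
    p.K = 256;  // accumulation depth      (illustrative)

    // alpha = 1, beta = 0 and a pretranspose hint mirror the common
    // fully-connected case; num_threads caps the parallelism the
    // heuristics may assume.
    return arm_compute::get_gemm_info(arm_compute::DataType::F32, ci,
                                      /* num_threads */ 4, p,
                                      /* alpha */ 1.0f, /* beta */ 0.0f,
                                      /* pretranspose_hint */ true);
}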
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h
index 46a05abcdb..e2b849aa3d 100644
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h
+++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,8 +26,13 @@
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/WindowIterator.h"
namespace arm_compute
{
@@ -84,7 +89,7 @@ public:
};
/** Equivalent to arm_gemm::GemmInterleaved's strategy::kernel() but using Compute Library types. */
-template <typename To, typename Tr, bool use_dot = false>
+template <typename strategy>
class NEGEMMInterleavedMatrixMultiplyWrapperTemplate : public NEGEMMInterleavedMatrixMultiplyWrapper
{
public:
@@ -94,7 +99,7 @@ public:
* @param[in] transformed_b Already reshaped matrix B.
* @param[out] tmp_c Temporary buffer to be used to store intermediate results.
* @param[in,out] c Result matrix C.
- * @param[in] batch_window Window containing iteration information for the M and batch dimensions.
+ * @param[in] block_walker Window containing iteration information for the M and batch dimensions.
* @param[in] block_sizes Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes).
* @param[in] params M, N, K sizes.
* @param[in] is_pretransposed Is B also pretransposed ?
@@ -102,30 +107,117 @@ public:
* @param[in] beta Beta value
* @param[in] max_num_threads Maximum number of threads that might be used for the calculations.
*/
- void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &batch_window, const BlockSizes &block_sizes,
- const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads);
+ void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker, const BlockSizes &block_sizes,
+ const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)
+ {
+ _prepared_a = prepared_a;
+ _transformed_b = transformed_b;
+ _tmp_c = tmp_c;
+ _c = c;
+ _block_walker = block_walker;
+ _block_sizes = block_sizes;
+ _params = params;
+ _b_is_pretransposed = b_is_pretransposed;
+ _alpha = alpha;
+ _beta = beta;
+
+ auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads }));
+ }
// Inherited methods overridden:
- void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override;
- void create_workloads(std::vector<MatrixMultiplyWorkload> &workloads) override;
+ void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override
+ {
+ strategy strat(info.cpu_info);
+ TensorAccessor<typename strategy::operand_type> prepared_a(*_prepared_a);
+ TensorAccessor<typename strategy::operand_type> transformed_b(*_transformed_b);
+ TensorAccessor<typename strategy::result_type> c(*_c);
+ TensorAccessor<typename strategy::result_type> tmp_c(*_tmp_c);
+
+ int prev_batch = -1;
+ typename strategy::operand_type *a_ptr = nullptr;
+ auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
+ {
+ const unsigned int y = id.x();
+ const unsigned int batch = id.y();
+ const unsigned int ymax = std::min(_params.M, y + strategy::out_height());
+
+ // If it's the first block of a new batch then reset the pointer to A.
+ if(prev_batch != static_cast<int>(batch))
+ {
+ const unsigned int first_m = id.x();
+ a_ptr = prepared_a(0, first_m, batch);
+ prev_batch = batch;
+ }
+
+ // Call matrix multiply assembly routine to process the block:
+ strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k);
+ a_ptr += strategy::out_height() * wl._kern_k;
+
+ // Merge the result with the other blocks' results:
+ strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? _beta : static_cast<typename strategy::result_type>(1)));
+ });
+ auto on_new_row_size = [&](unsigned int start, unsigned int end)
+ {
+ //Nothing to do
+ };
+ window_iterator.iterate_2D(on_new_row_size);
+ }
+ void create_workloads(std::vector<MatrixMultiplyWorkload> &workloads) override
+ {
+ unsigned int offset_transformed_b = 0;
+ unsigned int wl_index = 0;
+ unsigned int num_buffers = 0, reshaped_block_size = 0;
+
+ if(!_b_is_pretransposed)
+ {
+ num_buffers = _transformed_b->info()->tensor_shape()[1];
+ reshaped_block_size = _transformed_b->info()->tensor_shape()[0];
+ }
+ execute_window_loop(_block_walker, [&](const Coordinates & id)
+ {
+ const unsigned int x0 = id.x();
+ const unsigned int k0 = id.y();
+ const unsigned int multi = id.z();
+
+ const unsigned int xmax = std::min(x0 + _block_walker.x().step(), _params.N);
+ const unsigned int kmax = std::min(k0 + _block_walker.y().step(), _params.K);
+
+ // Figure out how many "K" the kernel will actually process.
+ const int kern_k = ceil_to_multiple(kmax - k0, strategy::k_unroll());
+ const int bblocks = DIV_CEIL(xmax - x0, strategy::out_width());
+
+ workloads.push_back(MatrixMultiplyWorkload(offset_transformed_b, x0, xmax, k0, kmax, multi, kern_k, bblocks));
+
+ if(_b_is_pretransposed)
+ {
+ offset_transformed_b += bblocks * strategy::out_width() * kern_k;
+ }
+ else
+ {
+ // Rotate through the BufferManager's buffers:
+ wl_index++;
+ offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size;
+ }
+ });
+ }
private:
const ITensor *_prepared_a
{
nullptr
};
- const ITensor *_transformed_b{ nullptr };
- ITensor *_tmp_c{ nullptr };
- ITensor *_c{ nullptr };
- unsigned int _Nsize{ 0 };
- unsigned int _Ksize{ 0 };
- bool _transpose_b{ false };
- BlockSizes _block_sizes{};
- INEGEMMWrapperKernel::Params _params{};
- Window _block_walker{};
- bool _b_is_pretransposed{ false };
- Tr _alpha{};
- Tr _beta{};
+ const ITensor *_transformed_b{ nullptr };
+ ITensor *_tmp_c{ nullptr };
+ ITensor *_c{ nullptr };
+ unsigned int _Nsize{ 0 };
+ unsigned int _Ksize{ 0 };
+ bool _transpose_b{ false };
+ BlockSizes _block_sizes{};
+ INEGEMMWrapperKernel::Params _params{};
+ Window _block_walker{};
+ bool _b_is_pretransposed{ false };
+ typename strategy::result_type _alpha{};
+ typename strategy::result_type _beta{};
};
} // namespace arm_compute
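
The create_workloads() loop above is the heart of the block decomposition: it walks (x0, k0, multi) blocks and rounds each one to the kernel's processing granularity. The following standalone model reproduces the kern_k/bblocks arithmetic so it can be checked in isolation; the strategy constants (out_width, k_unroll) and block sizes are illustrative stand-ins, not values from the patch.

#include <algorithm>
#include <cstdio>

static unsigned int ceil_to_multiple(unsigned int v, unsigned int m)
{
    return ((v + m - 1) / m) * m;
}

static unsigned int div_ceil(unsigned int a, unsigned int b)
{
    return (a + b - 1) / b;
}

int main()
{
    const unsigned int N = 100, K = 70;            // problem size (example)
    const unsigned int x_block = 48, k_block = 32; // blocking (example)
    const unsigned int out_width = 16, k_unroll = 4;

    // Mirrors the _block_walker window: x0 is the innermost dimension.
    for (unsigned int k0 = 0; k0 < K; k0 += k_block) {
        const unsigned int kmax = std::min(k0 + k_block, K);
        // The kernel always consumes a whole k_unroll multiple:
        const unsigned int kern_k = ceil_to_multiple(kmax - k0, k_unroll);
        for (unsigned int x0 = 0; x0 < N; x0 += x_block) {
            const unsigned int xmax = std::min(x0 + x_block, N);
            // Number of out_width-wide panels of B this workload covers:
            const unsigned int bblocks = div_ceil(xmax - x0, out_width);
            std::printf("x0=%2u k0=%2u kern_k=%2u bblocks=%u\n", x0, k0, kern_k, bblocks);
        }
    }
    return 0;
}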
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h
index e46c33018b..ba3223f66d 100644
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h
+++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,16 @@
#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVEDPREPAREBWRAPPERKERNEL_H__
#define __ARM_COMPUTE_NEGEMMINTERLEAVEDPREPAREBWRAPPERKERNEL_H__
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
namespace arm_compute
{
-class ITensor;
-
/** Unit of work for @ref NEGEMMInterleavedPrepareBWrapperKernel to process */
struct PrepareBWorkload
{
@@ -56,6 +58,84 @@ struct PrepareBWorkload
unsigned int _kmax; /**< Last value to process along the K dimension. */
};
+namespace detail
+{
+// Call the lambda function for each workload generated by the passed window.
+template <typename strategy, bool use_buffer_manager, typename Lambda>
+void for_each_element_in_window(const Window &window, const ITensor *b, ITensor *transformed_b, unsigned int N, unsigned int K, Lambda &&lambda)
+{
+ unsigned int wl_index = 0;
+ unsigned int num_buffers = 0, reshaped_block_size = 0;
+
+ if(use_buffer_manager)
+ {
+ num_buffers = transformed_b->info()->tensor_shape()[1];
+ reshaped_block_size = transformed_b->info()->strides_in_bytes().y();
+ }
+
+ unsigned int offset_transformed_b = transformed_b->info()->offset_first_element_in_bytes();
+ execute_window_loop(window, [&](const Coordinates & coordinates)
+ {
+ const unsigned int x0 = coordinates.x();
+ const unsigned int k0 = coordinates.y();
+ const unsigned int multi = coordinates.z();
+
+ const unsigned int offset_b = b->info()->offset_element_in_bytes(Coordinates(0, 0, multi));
+ const unsigned int xmax = std::min(x0 + window.x().step(), N);
+ const unsigned int kmax = std::min(k0 + window.y().step(), K);
+
+ /* Figure out the size of each block. */
+ unsigned int x_size = (xmax - x0);
+ unsigned int k_size = (kmax - k0);
+
+ /* Round sizes up as needed. */
+ x_size = ceil_to_multiple(x_size, strategy::out_width());
+ k_size = ceil_to_multiple(k_size, strategy::k_unroll());
+
+ lambda(PrepareBWorkload(offset_b, offset_transformed_b, x0, xmax, k0, kmax));
+
+ //Each workload represents one block:
+ if(use_buffer_manager)
+ {
+ // Rotate through the BufferManager's buffers:
+ wl_index++;
+ offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size;
+ }
+ else
+ {
+ offset_transformed_b += (x_size * k_size * sizeof(typename strategy::operand_type));
+ }
+ });
+}
+
+// Calculate the size of transformed_b:
+template <typename strategy>
+unsigned int get_B_pretransposed_array_size(unsigned int N, unsigned int K, const BlockSizes &bs, unsigned int multis)
+{
+ // How many full blocks do N / K contain ?
+ size_t num_full_k = K / bs.k_block;
+ size_t num_full_x = N / bs.x_block;
+
+ ARM_COMPUTE_ERROR_ON(bs.x_block % strategy::out_width() != 0);
+ ARM_COMPUTE_ERROR_ON(bs.k_block % strategy::k_unroll() != 0);
+
+ size_t normal_x_size = bs.x_block;
+ size_t normal_k_size = bs.k_block;
+
+ // Round up the leftovers to be a multiple of the strategy processing size:
+ size_t left_over_x_size = ceil_to_multiple(N % bs.x_block, strategy::out_width());
+ size_t left_over_k_size = ceil_to_multiple(K % bs.k_block, strategy::k_unroll());
+
+ // Calculate the total size of the buffer:
+ size_t total = num_full_k * normal_k_size * (num_full_x * normal_x_size + left_over_x_size);
+ total += left_over_k_size * (left_over_x_size + num_full_x * normal_x_size);
+
+ total *= multis;
+
+ return total;
+}
+} // namespace detail
+
/** Common interface for the templated wrappers around the B reshape NEON assembly implementations */
class NEGEMMInterleavedPrepareBWrapperKernel : public INEKernel
{
@@ -93,7 +173,7 @@ public:
/** Equivalent to arm_gemm::GemmInterleaved's strategy::transforms::PrepareB() but using Compute Library types.
*/
-template <typename To, bool use_dot = false>
+template <typename strategy>
class NEGEMMInterleavedPrepareBWrapperKernelTemplate : public NEGEMMInterleavedPrepareBWrapperKernel
{
public:
@@ -105,13 +185,55 @@ public:
* @param[in] ci CPU information
* @param[in] params M, N, K sizes.
*/
- void configure(const ITensor *b, ITensor *transformed_b, bool transpose_b, const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params);
+ void configure(const ITensor *b, ITensor *transformed_b, bool transpose_b, const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params)
+ {
+ const unsigned int multis = b->info()->tensor_shape().z();
+ _Nsize = b->info()->tensor_shape().x();
+ _Ksize = b->info()->tensor_shape().y();
+ _b = b;
+ _transformed_b = transformed_b;
+ _transpose_b = transpose_b;
+
+ _block_sizes = calculate_block_sizes<strategy>(ci, params.M, params.N, params.K);
+
+ auto_init_if_empty(*transformed_b->info(), b->info()->clone()->set_tensor_shape(TensorShape{ detail::get_B_pretransposed_array_size<strategy>(_Nsize, _Ksize, _block_sizes, multis) }));
+
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_Nsize, _block_sizes.x_block), _block_sizes.x_block));
+ window.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_Ksize, _block_sizes.k_block), _block_sizes.k_block));
+ window.set(Window::DimZ, Window::Dimension(0, multis));
+
+ INEKernel::configure(window);
+ }
// Inherited methods overridden:
- void transform(const PrepareBWorkload &wl, const ThreadInfo &info) override;
- void create_workloads(std::vector<PrepareBWorkload> &workloads) override;
- void run(const Window &window, const ThreadInfo &info) override;
- BlockSizes block_sizes() const override;
+ void transform(const PrepareBWorkload &wl, const ThreadInfo &info) override
+ {
+ strategy strat(info.cpu_info);
+ strat.transforms.PrepareB(reinterpret_cast<typename strategy::operand_type *>(_transformed_b->buffer() + wl._offset_transformed_b),
+ reinterpret_cast<typename strategy::operand_type *>(_b->buffer() + wl._offset_b),
+ _b->info()->strides_in_bytes().y() / sizeof(typename strategy::operand_type),
+ wl._x0, wl._xmax, wl._k0, wl._kmax, _transpose_b);
+ }
+ void create_workloads(std::vector<PrepareBWorkload> &workloads) override
+ {
+ detail::for_each_element_in_window<strategy, true>(window(), _b, _transformed_b, _Nsize, _Ksize, [&workloads](PrepareBWorkload && wl)
+ {
+ workloads.push_back(std::move(wl));
+ });
+ }
+ void run(const Window &window, const ThreadInfo &info) override
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(window, INEKernel::window());
+ detail::for_each_element_in_window<strategy, false>(window, _b, _transformed_b, _Nsize, _Ksize, [&](PrepareBWorkload && wl)
+ {
+ this->transform(wl, info);
+ });
+ }
+ BlockSizes block_sizes() const override
+ {
+ return _block_sizes;
+ }
private:
const ITensor *_b
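
detail::get_B_pretransposed_array_size() above sizes the reshaped-B buffer as full blocks plus edge blocks that are rounded up to what the kernel will actually read. A standalone restatement of that formula, with out_width/k_unroll standing in for the strategy constants; the result is in elements of the operand type.

#include <cstddef>
#include <cstdio>

static size_t ceil_to_multiple(size_t v, size_t m) { return ((v + m - 1) / m) * m; }

size_t b_pretransposed_elements(size_t N, size_t K, size_t x_block, size_t k_block,
                                size_t out_width, size_t k_unroll, size_t multis)
{
    const size_t num_full_k = K / k_block;
    const size_t num_full_x = N / x_block;

    // Leftover edge blocks are rounded up to the kernel's granularity:
    const size_t left_x = ceil_to_multiple(N % x_block, out_width);
    const size_t left_k = ceil_to_multiple(K % k_block, k_unroll);

    size_t total = num_full_k * k_block * (num_full_x * x_block + left_x);
    total += left_k * (left_x + num_full_x * x_block);
    return total * multis;
}

int main()
{
    // Example: N=100, K=70, 48x32 blocks, 16-wide kernel, k_unroll=4.
    std::printf("%zu elements\n", b_pretransposed_elements(100, 70, 48, 32, 16, 4, 1));
}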
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h
index b6831e3ca9..5d6cd02398 100644
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h
+++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,8 +25,13 @@
#define __ARM_COMPUTE_NEGEMMINTERLEAVEDTRANSFORMAWRAPPER_H__
#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/WindowIterator.h"
namespace arm_compute
{
@@ -76,7 +81,7 @@ public:
};
/** Type specialisations of @ref NEGEMMInterleavedTransformAWrapper */
-template <typename To, bool use_dot = false>
+template <typename strategy>
class NEGEMMInterleavedTransformAWrapperTemplate : public NEGEMMInterleavedTransformAWrapper
{
public:
@@ -88,11 +93,67 @@ public:
* @param[in] block_walker Window representing the layout of the matrix's blocks
* @param[in] params M, N, K sizes.
*/
- void configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params);
+ void configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params)
+ {
+ _a = a;
+ _transformed_a = transformed_a;
+ _transpose_a = transpose_a;
+ _Ksize = params.K;
+ _Msize = params.M;
+ _k_multi_window = block_walker.shift_dimensions(1); // block_walker contains (M,K,Multi) --> shift by 1 to get rid of the "M" dimension
+ }
// Inherited methods overridden:
- void transform(const TransformAWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override;
- void create_workloads(std::vector<TransformAWorkload> &workloads) override;
+ void transform(const TransformAWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override
+ {
+ strategy strat(info.cpu_info);
+ TensorAccessor<typename strategy::operand_type> a(*_a);
+ TensorAccessor<typename strategy::operand_type> transformed_a(*_transformed_a);
+
+ if(_a->info()->data_layout() == DataLayout::NHWC)
+ {
+ // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is
+ // the relevant multiple of the row stride.
+ const size_t nhwc_batch_stride = _a->info()->strides_in_bytes().y() * _Msize;
+ a.set_stride(2, nhwc_batch_stride);
+ }
+
+ unsigned int last_m = 0;
+ //TODO: Create a new iterate_1D( DimY);
+ int last_y = -1;
+ auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
+ {
+ if(id.y() != last_y)
+ {
+ last_y = id.y();
+ unsigned int batch = id.y();
+ unsigned int first_m = id.x();
+
+ if(first_m >= last_m)
+ return;
+
+ strat.transforms.PrepareA(transformed_a(0, first_m, batch),
+ a(0, 0, batch, wl._multi),
+ a.stride(1), first_m, last_m, wl._k0, wl._kmax, _transpose_a);
+ }
+ });
+ auto on_new_row_size = [&](unsigned int start, unsigned int end)
+ {
+ last_m = std::min(end, _Msize);
+ };
+ window_iterator.iterate_2D(on_new_row_size);
+ }
+ void create_workloads(std::vector<TransformAWorkload> &workloads) override
+ {
+ execute_window_loop(_k_multi_window, [&](const Coordinates & id)
+ {
+ const unsigned int k0 = id.x();
+ const unsigned int multi = id.y();
+ const unsigned int kmax = std::min(k0 + _k_multi_window.x().step(), _Ksize);
+
+ workloads.push_back(TransformAWorkload(k0, kmax, multi));
+ });
+ }
private:
const ITensor *_a
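
One subtlety in transform() above: for NHWC inputs the batch stride of A is not read from the tensor but synthesised as row stride times M, so the output can be interpreted as 3D. In isolation, with illustrative numbers:

#include <cstddef>
#include <cstdio>

int main()
{
    const size_t row_stride_bytes = 256; // stride between consecutive rows of A (example)
    const size_t M = 24;                 // rows per batch (_Msize, example)
    // As in transform(): batch stride = row stride * M.
    const size_t nhwc_batch_stride = row_stride_bytes * M;
    std::printf("NHWC batch stride = %zu bytes\n", nhwc_batch_stride);
}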
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
index 162cbc5c46..26c1f3df89 100644
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#pragma once
#include <memory>
+#include <cstring>
#include "arm_gemm_local.hpp"
#include "gemm_common.hpp"
@@ -37,45 +38,57 @@ enum class GemmMethod
GEMV_PRETRANSPOSED,
GEMV_NATIVE_TRANSPOSED,
GEMM_NATIVE,
- GEMM_INTERLEAVED,
- GEMM_INTERLEAVED_FP16,
- GEMM_INTERLEAVED_DOT
+ GEMM_HYBRID,
+ GEMM_INTERLEAVED
+};
+
+
+struct KernelDescription
+{
+ GemmMethod method = GemmMethod::DEFAULT;
+ std::string name = "";
+
+ KernelDescription(GemmMethod m, std::string n) : method(m), name(n) { }
+ KernelDescription() { }
};
struct GemmConfig
{
- GemmMethod method = GemmMethod::DEFAULT;
+ GemmMethod method = GemmMethod::DEFAULT;
+ std::string filter = "";
unsigned int inner_block_size = 0;
unsigned int outer_block_size = 0;
GemmConfig(GemmMethod method) : method(method) { }
+ GemmConfig() { }
};
template<typename T>
struct GemmArgs
{
public:
- const CPUInfo *_ci;
- unsigned int _Msize;
- unsigned int _Nsize;
- unsigned int _Ksize;
- unsigned int _nbatches;
- unsigned int _nmulti;
- bool _trA;
- bool _trB;
- T _alpha;
- T _beta;
- int _maxthreads;
- bool _pretransposed_hint;
+ const CPUInfo *_ci;
+ unsigned int _Msize;
+ unsigned int _Nsize;
+ unsigned int _Ksize;
+ unsigned int _nbatches;
+ unsigned int _nmulti;
+ bool _trA;
+ bool _trB;
+ T _alpha;
+ T _beta;
+ int _maxthreads;
+ bool _pretransposed_hint;
+ const GemmConfig *_cfg;
GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N,
const unsigned int K, const unsigned int nbatches,
const unsigned int nmulti, const bool trA, const bool trB,
const T alpha, const T beta, const int maxthreads,
- const bool pretransposed_hint) :
- _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
- _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads),
- _pretransposed_hint(pretransposed_hint)
+ const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) :
+ _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
+ _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads),
+ _pretransposed_hint(pretransposed_hint), _cfg(cfg)
{
}
};
@@ -90,7 +103,7 @@ using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
* provided parameters be provided using the supplied method? */
template<typename Top, typename Tret>
-bool method_is_compatible(GemmMethod method, GemmArgs<Tret> &args);
+bool method_is_compatible(GemmMethod method, const GemmArgs<Tret> &args);
template<typename Top, typename Tret>
bool method_is_compatible(GemmMethod method, const CPUInfo &ci,
@@ -107,14 +120,14 @@ bool method_is_compatible(GemmMethod method, const CPUInfo &ci,
/* get_gemm_method(): Given the templated types and provided parameters,
* which is the preferred method to implement this GEMM? */
template<typename Top, typename Tret>
-GemmMethod get_gemm_method(GemmArgs<Tret> &args);
+KernelDescription get_gemm_method(const GemmArgs<Tret> &args);
template<typename Top, typename Tret>
-GemmMethod get_gemm_method(const CPUInfo &ci,
- const unsigned int M, const unsigned int N, const unsigned int K,
- const unsigned int nbatches, const unsigned int nmulti,
- const bool trA, const bool trB, const Tret alpha, const Tret beta,
- const int maxthreads, const bool pretransposed_hint)
+KernelDescription get_gemm_method(const CPUInfo &ci,
+ const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti,
+ const bool trA, const bool trB, const Tret alpha, const Tret beta,
+ const int maxthreads, const bool pretransposed_hint)
{
GemmArgs<Tret> args(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint);
@@ -122,7 +135,7 @@ GemmMethod get_gemm_method(const CPUInfo &ci,
}
template<typename Top, typename Tret>
-UniqueGemmCommon<Top, Tret> gemm(GemmArgs<Tret> &args, GemmConfig *cfg);
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs<Tret> &args);
/** Request an object to process a GEMM.
*
@@ -147,9 +160,24 @@ UniqueGemmCommon<Top, Tret> gemm(const CPUInfo &ci,
const bool trA, const bool trB, const Tret alpha, const Tret beta,
const int maxthreads, const bool pretransposed_hint, GemmConfig *cfg=nullptr)
{
+ GemmArgs<Tret> args(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint, cfg);
+
+ return gemm<Top, Tret>(args);
+}
+
+template<typename Top, typename Tret>
+std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args);
+
+template<typename Top, typename Tret>
+std::vector<std::string> get_compatible_kernels(const CPUInfo &ci,
+ const unsigned int M, const unsigned int N, const unsigned int K,
+ const unsigned int nbatches, const unsigned int nmulti,
+ const bool trA, const bool trB, const Tret alpha, const Tret beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
GemmArgs<Tret> args(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint);
- return gemm<Top, Tret>(args, cfg);
+ return get_compatible_kernels<Top, Tret>(args);
}
} // namespace arm_gemm
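
With this change a GemmConfig travels inside GemmArgs rather than being passed to gemm() separately. An illustrative call site follows; the values are made up, and the filter field is assumed, from its name, to select kernels by name.

#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"

using namespace arm_gemm;

UniqueGemmCommon<float, float> make_gemm(const CPUInfo &ci)
{
    // Optional config: a non-DEFAULT method or non-zero block sizes
    // override the heuristics.
    GemmConfig cfg;
    cfg.method = GemmMethod::GEMM_HYBRID;

    GemmArgs<float> args(&ci, /*M*/ 128, /*N*/ 128, /*K*/ 64,
                         /*nbatches*/ 1, /*nmulti*/ 1,
                         /*trA*/ false, /*trB*/ false,
                         /*alpha*/ 1.0f, /*beta*/ 0.0f,
                         /*maxthreads*/ 4, /*pretransposed_hint*/ true, &cfg);

    // For diagnostics, the new query can list every kernel that could
    // handle these arguments:
    // std::vector<std::string> names = get_compatible_kernels<float, float>(args);

    return gemm<float, float>(args);
}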
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
index b43d6eaca6..7b4f0149e3 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,11 +88,11 @@ public:
* This has an empty default implementation, as GEMMs which don't care
* about thread count can safely ignore this.
*/
- virtual void set_nthreads(int nthreads) { };
+ virtual void set_nthreads(int) { };
/* Actually do the work. Provide a threadid to index any per-thread
* buffers, and a start/end range to indicate which work to do. */
- virtual void execute(unsigned int start, unsigned int end, int threadid) = 0;
+ virtual void execute(unsigned int, unsigned int, int) = 0;
/*** Working space interface (optional) ***/
/* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
@@ -108,9 +108,10 @@ public:
/* Total number of bytes of space needed for pretransposed arrays. */
virtual size_t get_B_pretransposed_array_size() const { return 0; }
/* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) { };
+ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
+ virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
/* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
- virtual void set_pretransposed_B_data(void *buffer) { }
+ virtual void set_pretransposed_B_data(void *) { }
// Destructor
virtual ~GemmCommon() { }
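
The working-space and pretranspose interfaces above are unchanged in shape; only the unused parameter names were dropped to silence warnings. For reference, a sketch of how a caller drives that protocol, assuming single-threaded execution:

#include <vector>

#include "arm_compute/core/NEON/kernels/assembly/gemm_common.hpp"

template <typename To, typename Tr>
void prepare_and_run(arm_gemm::GemmCommon<To, Tr> &g, const To *B,
                     int ldb, int B_multi_stride)
{
    // Working space (optional; zero-sized for GEMMs that need none).
    std::vector<char> working(g.get_working_size());
    if (!working.empty()) {
        g.set_working_space(working.data());
    }

    // One-off pretranspose of B. This buffer must remain allocated for
    // the duration of every later execute() call.
    std::vector<char> pretransposed(g.get_B_pretransposed_array_size());
    if (!pretransposed.empty()) {
        g.pretranspose_B_array(pretransposed.data(), B, ldb, B_multi_stride);
    }

    // Single-threaded: run the whole window as thread 0.
    g.execute(0, g.get_window_size(), 0);
}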
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h b/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h
index 26236ffb35..3ccfbc512b 100644
--- a/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h
+++ b/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,9 @@
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IScheduler.h"
@@ -36,13 +39,8 @@
namespace arm_compute
{
+// Forward declarations
class ITensor;
-class NEGEMMInterleavedPrepareBWrapperKernel;
-class PrepareBWorkload;
-class TransformAWorkload;
-class MatrixMultiplyWorkload;
-class NEGEMMInterleavedTransformAWrapper;
-class NEGEMMInterleavedMatrixMultiplyWrapper;
/** Buffer manager used when reshaping B on the fly
*
@@ -97,6 +95,7 @@ class NEGEMMInterleavedWrapper : public IFunction
{
public:
NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ ~NEGEMMInterleavedWrapper() = default;
NEGEMMInterleavedWrapper(const NEGEMMInterleavedWrapper &) = delete;
NEGEMMInterleavedWrapper &operator=(const NEGEMMInterleavedWrapper &) = delete;
@@ -111,9 +110,8 @@ public:
* @param[in] alpha Scalar multiplier to apply to AB matrix product.
* @param[in] beta Scalar multiplier to apply to input C matrix before adding product.
* @param[in] pretranspose_b If true, pretranspose B once during the prepare() stage instead of on the fly every time.
- * @param[in] use_dot (Optional) If the input's type is U8/S8/QASYMM8 then use the dot product flavour or the matrix multiply routine. (Must be supported by the hardware).
*/
- void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot = false);
+ void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b);
// Inherited methods overridden:
void run() override;
@@ -143,6 +141,5 @@ private:
std::vector<IScheduler::Workload> _workloads{};
std::string _tag{};
};
-
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDWRAPPER_H__ */
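
Call-site sketch for the simplified configure() above; tensors are assumed to be already created and allocated.

#include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"

void run_interleaved_gemm(const arm_compute::ITensor *a, const arm_compute::ITensor *b,
                          arm_compute::ITensor *c)
{
    arm_compute::NEGEMMInterleavedWrapper wrapper;
    // The use_dot flag is gone: U8/S8 dot-product kernels are now picked
    // by the arm_gemm heuristics from the data type and CPU features.
    wrapper.configure(a, b, c, /*alpha*/ 1.0f, /*beta*/ 0.0f, /*pretranspose_b*/ true);
    wrapper.run();
}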
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 9194bdd4d4..1a90e96140 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,75 +38,51 @@
namespace arm_gemm {
-#ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
-public:
-
- UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
- return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args));
- }
-
- GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { }
-};
-
-#elif defined(__aarch64__)
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)
-class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
-public:
+static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
+#if defined(__ARM_FEATURE_SVE)
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_fp16_mla_3VLx8",
+ [](const GemmArgs<__fp16> &args) { return (args._Ksize > 4); },
+ [](const GemmArgs<__fp16> &args) { return true; },
+ [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
+},
+#endif
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "hgemm_24x8",
+ [](const GemmArgs<__fp16> &args) {
#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- bool is_supported(const GemmArgs<__fp16> &args) override {
return args._ci->has_fp16();
- }
-#endif
-
- UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
- return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args));
- }
-
- GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { }
-};
-#endif
-
-#endif // __aarch64__
-
-class GemmImpl_gemm_fp16_interleaved : public GemmImplementation<__fp16, __fp16> {
-public:
- UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
-#ifdef __aarch64__
- return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args));
-#elif defined(__arm__)
- return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args));
#else
-# error Unknown Architecture
+ return true;
#endif
- }
-
- GemmImpl_gemm_fp16_interleaved() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
-static GemmImpl_gemm_fp16_interleaved_fp16 gemm_fp16_interleaved_fp16_impl{};
-#endif
-static GemmImpl_gemm_fp16_interleaved gemm_fp16_interleaved_impl{};
-
-static std::vector<GemmImplementation<__fp16, __fp16> *> gemm_fp16_methods = {
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
- &gemm_fp16_interleaved_fp16_impl,
+ },
+ [](const GemmArgs<__fp16> &args) { return true; },
+ [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
+},
#endif
- &gemm_fp16_interleaved_impl
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr,
+}
};
template<>
-std::vector<GemmImplementation<__fp16, __fp16> *> &gemm_implementation_list<__fp16, __fp16>() {
+const GemmImplementation<__fp16, __fp16> *gemm_implementation_list<__fp16, __fp16>() {
return gemm_fp16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(GemmArgs<__fp16> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<__fp16, __fp16>(GemmArgs<__fp16> &args);
-template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, GemmArgs<__fp16> &args);
+template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(const GemmArgs<__fp16> &args);
+template KernelDescription get_gemm_method<__fp16, __fp16>(const GemmArgs<__fp16> &args);
+template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, const GemmArgs<__fp16> &args);
+template std::vector<std::string> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
} // namespace arm_gemm
-#endif // __ARM_FP16_ARGS
+#endif // __ARM_FP16_ARGS
\ No newline at end of file
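
The fp16 selection logic is now a flat, ordered table of GemmImplementation entries terminated by a DEFAULT sentinel, with optional is_supported/is_recommended predicates (null meaning "always"). A minimal standalone model of that pattern; the entry names and predicates are illustrative, not the library's types.

#include <cstdio>

struct Args { unsigned int K; };

struct Entry {
    const char *name;
    bool (*supported)(const Args &);   // nullptr => always supported
    bool (*recommended)(const Args &); // nullptr => always recommended
};

static const Entry table[] = {
    { "interleaved_fp16_mla_3VLx8", [](const Args &a) { return a.K > 4; }, nullptr },
    { "hgemm_24x8", nullptr, nullptr },
    { nullptr, nullptr, nullptr } // sentinel, like the DEFAULT terminator
};

// First supported-and-recommended entry wins, as in the real list.
const char *pick(const Args &a)
{
    for (const Entry *e = table; e->name; e++) {
        if (e->supported && !e->supported(a)) continue;
        if (e->recommended && !e->recommended(a)) continue;
        return e->name;
    }
    return "none";
}

int main() { std::printf("%s\n", pick(Args{ 8 })); }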
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 7d14971b70..8bc33ccb69 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_gemm.hpp"
#include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
#include "gemm_native.hpp"
@@ -30,112 +31,140 @@
#include "gemv_native_transposed.hpp"
#include "gemv_pretransposed.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_sgemv_trans.hpp"
-#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
#include "kernels/a64_sgemm_native_16x4.hpp"
+#include "kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp"
+#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemv_trans.hpp"
+#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp"
#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
+#include "kernels/sve_native_fp32_mla_4VLx4.hpp"
+#include "kernels/sve_smallK_fp32_mla_1VLx4.hpp"
+#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp"
namespace arm_gemm {
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-// SGEMM implementations for AArch64 without SVE
-
-// Pretransposed GEMV
-class GemmImpl_sgemm_gemv_pretransposed : public GemmImplementation<float, float> {
-public:
- bool is_supported(const GemmArgs<float> &args) override {
- return (args._Msize==1 && args._alpha==1.0f && args._pretransposed_hint && args._nbatches==1);
- }
+static const GemmImplementation<float, float> gemm_fp32_methods[] =
+{
+{
+ GemmMethod::GEMV_BATCHED,
+ "gemv_batched",
+ [](const GemmArgs<float> &args) { return (args._Msize==1) && (args._nbatches>1); },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemvBatched<float, float>(args); }
+},
+#ifdef __aarch64__
+{
+ GemmMethod::GEMV_PRETRANSPOSED,
+ "sgemv_pretransposed",
+ [](const GemmArgs<float> &args) { return (args._Msize==1 && args._alpha==1.0f && args._pretransposed_hint && args._nbatches==1); },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemvPretransposed<sgemv_pretransposed, float, float>(args); }
+},
+{
+ GemmMethod::GEMV_NATIVE_TRANSPOSED,
+ "sgemv_trans",
+ [](const GemmArgs<float> &args) { return (args._Msize==1 && args._alpha==1.0f && !args._trA && !args._trB && args._nbatches==1); },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemvNativeTransposed<sgemv_trans, float, float>(args); }
+},
- UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
- return UniqueGemmCommon<float, float> (new GemvPretransposed<sgemv_pretransposed, float, float>(args._ci, args._Nsize, args._Ksize, args._nmulti, args._trB, args._beta));
- }
-
- GemmImpl_sgemm_gemv_pretransposed() : GemmImplementation<float, float>(GemmMethod::GEMV_PRETRANSPOSED) { }
-};
-
-// Native GEMV
-class GemmImpl_sgemm_gemv_native_transposed : public GemmImplementation<float, float> {
-public:
- bool is_supported(const GemmArgs<float> &args) override {
- return (args._Msize==1 && args._alpha==1.0f && !args._trA && !args._trB && args._nbatches==1);
- }
-
- UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
- return UniqueGemmCommon<float, float> (new GemvNativeTransposed<sgemv_trans, float, float>(args._ci, args._Nsize, args._Ksize, args._nmulti, args._beta));
- }
-
- GemmImpl_sgemm_gemv_native_transposed() : GemmImplementation<float, float>(GemmMethod::GEMV_NATIVE_TRANSPOSED) { }
-};
-
-// Native GEMM
-class GemmImpl_sgemm_gemm_native : public GemmImplementation<float, float> {
-public:
- bool is_supported(const GemmArgs<float> &args) override {
- return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB);
- }
-
- bool is_recommended(const GemmArgs<float> &args) override {
- return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8));
- }
-
- UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
- return UniqueGemmCommon<float, float> (new GemmNative<sgemm_native_16x4, float, float>(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, args._beta));
- }
-
- GemmImpl_sgemm_gemm_native() : GemmImplementation<float, float>(GemmMethod::GEMM_NATIVE) { }
-};
-#endif // __aarch64__
-
-// Interleaved GEMM
-class GemmImpl_sgemm_gemm_interleaved : public GemmImplementation<float, float> {
-public:
- UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
#ifdef __ARM_FEATURE_SVE
- return UniqueGemmCommon<float, float> (new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args));
-#elif defined(__aarch64__)
- return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_12x8, float, float>(args));
-#elif defined(__arm__)
- return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_8x6, float, float>(args));
-#else
-# error Unknown Architecture.
-#endif
- }
-
- GemmImpl_sgemm_gemm_interleaved() : GemmImplementation<float, float>(GemmMethod::GEMM_INTERLEAVED) { }
-};
+ // SVE smallk / native / hybrid methods
+{
+ GemmMethod::GEMM_HYBRID,
+ "smallK_hybrid_fp32_mla_1VLx4",
+ [](const GemmArgs<float> &args) { return (args._Ksize <= 24) && !args._trA && args._alpha==1.0f && args._pretransposed_hint; },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_1VLx4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_fp32_mla_4VLx4",
+ [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmHybrid<hybrid_fp32_mla_4VLx4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_NATIVE,
+ "smallK_fp32_mla_1VLx4",
+ [](const GemmArgs<float> &args) { return (args._Ksize <= 24) && !args._trA && !args._trB && args._alpha==1.0f; },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmNative<smallK_fp32_mla_1VLx4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_NATIVE,
+ "native_fp32_mla_4VLx4",
+ [](const GemmArgs<float> &args) { return (args._Ksize>4 && args._alpha==1.0f && !args._trA && !args._trB); },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmNative<native_fp32_mla_4VLx4, float, float>(args); }
+},
+#endif // __ARM_FEATURE_SVE
+
+// NEON native / hybrid methods
+{
+ GemmMethod::GEMM_HYBRID,
+ "sgemm_nativeA_pretransposeB_16x4",
+ [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmHybrid<sgemm_nativeA_pretransposeB_16x4, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_NATIVE,
+ "sgemm_native_16x4",
+ [](const GemmArgs<float> &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmNative<sgemm_native_16x4, float, float>(args); }
+},
-static GemmImpl_gemv_batched<float, float> gemv_batched_impl{};
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-static GemmImpl_sgemm_gemv_pretransposed sgemm_gemv_pretransposed_impl{};
-static GemmImpl_sgemm_gemv_native_transposed sgemm_gemv_native_transposed_impl{};
-static GemmImpl_sgemm_gemm_native sgemm_gemm_native_impl{};
-#endif
-static GemmImpl_sgemm_gemm_interleaved sgemm_gemm_interleaved_impl{};
+#ifdef __ARM_FEATURE_SVE
+ {
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_fp32_mla_3VLx8",
+ [](const GemmArgs<float> &args) { return (args._Ksize>4); },
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
+},
+#endif // __ARM_FEATURE_SVE
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sgemm_12x8",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
+},
+#endif // __aarch64__
-/* List of implementations (order matters) */
-static std::vector<GemmImplementation<float, float> *> SGemmMethods = {
- &gemv_batched_impl,
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
- &sgemm_gemv_pretransposed_impl,
- &sgemm_gemv_native_transposed_impl,
- &sgemm_gemm_native_impl,
-#endif
- &sgemm_gemm_interleaved_impl
+#ifdef __arm__
+ {
+ GemmMethod::GEMM_INTERLEAVED,
+ "sgemm_8x6",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<float> &args) { return new GemmInterleaved<sgemm_8x6, float, float>(args); }
+},
+#endif // __arm__
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
/* Templated function to return this list. */
template<>
-std::vector<GemmImplementation<float, float> *> &gemm_implementation_list<float, float>() {
- return SGemmMethods;
+const GemmImplementation<float, float> *gemm_implementation_list<float, float>() {
+ return gemm_fp32_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<float, float> gemm<float, float>(GemmArgs<float> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<float, float>(GemmArgs<float> &args);
-template bool method_is_compatible<float, float>(GemmMethod method, GemmArgs<float> &args);
+template UniqueGemmCommon<float, float> gemm<float, float>(const GemmArgs<float> &args);
+template KernelDescription get_gemm_method<float, float>(const GemmArgs<float> &args);
+template bool method_is_compatible<float, float>(GemmMethod method, const GemmArgs<float> &args);
+template std::vector<std::string> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
-} // namespace arm_gemm
+} // namespace arm_gemm
\ No newline at end of file
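
Worked example of the is_recommended predicate shared by the hybrid fp32 entries above (hybrid_fp32_mla_4VLx4 and sgemm_nativeA_pretransposeB_16x4): small K and N, or many multis with few rows per thread, steer selection toward the hybrid kernels. The shapes below are illustrative.

#include <cstdio>

bool hybrid_recommended(unsigned int M, unsigned int N, unsigned int K,
                        unsigned int nmulti, int maxthreads)
{
    // Same expression as the lambda in gemm_fp32_methods:
    return ((K <= 256) && (N <= 256)) ||
           ((nmulti > 1) && ((M / maxthreads) < 8));
}

int main()
{
    std::printf("%d\n", hybrid_recommended(1024, 128, 128, 1, 4)); // 1: small N and K
    std::printf("%d\n", hybrid_recommended(16, 512, 512, 4, 4));   // 1: multi-heavy, little M per thread
    std::printf("%d\n", hybrid_recommended(1024, 512, 512, 1, 4)); // 0: large GEMM, interleaved preferred
}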
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
new file mode 100644
index 0000000000..09f03c6332
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm {
+
+// Implementation of the GemmCommon abstract class.
+template<typename strategy, typename To, typename Tr>
+class GemmHybrid : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ /* const properties set by constructor */
+ const CPUInfo * const _ci;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const unsigned int _nbatches;
+ const unsigned int _nmulti;
+
+ const bool _trB;
+
+ const Tr _beta;
+
+ /* Blocking info */
+ unsigned int _k_block=0;
+ unsigned int _x_block=0;
+ unsigned int _Mround=0;
+
+ /* Pretransposed buffer. */
+ const Toi *_B_transposed=nullptr;
+
+ unsigned int _B_per_multi = 0;
+
+ /* We will need to walk through the blocks of B in a few contexts, so
+ * factor that out. */
+ class blockwalker {
+ private:
+ /* Size loops, etc. based on our parent's configuration */
+ const GemmHybrid<strategy, To, Tr> &_parent;
+
+ /* K, X and multi parameters for current iteration. */
+ unsigned int _k0=0, _x0=0;
+
+ unsigned int _index=0;
+ bool _done=false;
+ bool _newkblock=true;
+
+ public:
+ blockwalker(const GemmHybrid<strategy, To, Tr> &parent) : _parent(parent) { }
+
+ unsigned int xmax() {
+ return std::min(_x0 + _parent._x_block, _parent._Nsize);
+ }
+
+ unsigned int kmax() {
+ return std::min(_k0 + _parent._k_block, _parent._Ksize);
+ }
+
+ /* Advance to the next block, return false at the end. */
+ bool advance(void) {
+ if (_done) {
+ return false;
+ }
+
+ _newkblock=false;
+ _x0 += _parent._x_block;
+ if (_x0 >= _parent._Nsize) {
+ _x0=0;
+ _k0 += _parent._k_block;
+ if (_k0 >= _parent._Ksize) {
+ _done=true;
+ return false;
+ }
+ _newkblock=true;
+ }
+ _index++;
+
+ return true;
+ }
+
+ unsigned int k0(void) { return _k0; }
+ unsigned int x0(void) { return _x0; }
+ unsigned int index(void) { return _index; }
+ bool done(void) { return _done; }
+ bool newkblock(void) { return _newkblock; }
+ };
+
+
+public:
+ GemmHybrid(GemmHybrid &) = delete;
+ GemmHybrid & operator= (GemmHybrid &) = delete;
+
+ /* Constructor */
+ GemmHybrid(const GemmArgs<Tr> &args)
+ : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches),
+ _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta) {
+ const unsigned int L1_size = _ci->get_L1_cache_size();
+ const unsigned int L2_size = _ci->get_L2_cache_size();
+
+ _B_per_multi = (iceildiv(_Nsize, strategy::out_width()) * strategy::out_width()) *
+ (iceildiv(_Ksize, strategy::k_unroll()) * strategy::k_unroll());
+
+ // Work out blocking parameters, or override from config.
+
+ if (args._cfg && args._cfg->inner_block_size) {
+ _k_block = args._cfg->inner_block_size;
+ } else {
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ _k_block /= strategy::k_unroll();
+ _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ int num_k_blocks = iceildiv(_Ksize, _k_block);
+
+ // So divide the space equally into that many blocks.
+ _k_block = iceildiv(_Ksize, num_k_blocks);
+
+ // And round UP to the K unroll level required.
+ _k_block = iceildiv(_k_block, strategy::k_unroll());
+ _k_block *= strategy::k_unroll();
+ }
+
+ if (args._cfg && args._cfg->outer_block_size) {
+ _x_block = args._cfg->outer_block_size;
+ } else {
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+ (sizeof(Toi) * _k_block);
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ _x_block /= strategy::out_width();
+ _x_block = std::max(_x_block, 1U) * strategy::out_width();
+
+ // And tune to the presented problem size.
+ int num_x_blocks = iceildiv(_Nsize, _x_block);
+ _x_block = iceildiv(_Nsize, num_x_blocks);
+
+ _x_block = iceildiv(_x_block, strategy::out_width());
+ _x_block *= strategy::out_width();
+ }
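+        // Continuing the example with a 512KB L2: ((524288*9)/10 - 250*4*20)
+        // / (4*250) = 451, which rounds down to 448 (a multiple of
+        // out_width()); tuning to N=1000 gives iceildiv(1000, 3) = 334,
+        // rounded back up to 336.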
+
+        // Work out the rounded size of M - used here for the window calculation.
+ _Mround = iceildiv(_Msize, strategy::out_height());
+ _Mround *= strategy::out_height();
+ }
+
+ // Interface implementation - Compulsory functions
+
+ // Window size: Only the last thread should do a ragged block, so dole
+ // out work in units of out_height. Factor batches and multi into the
+ // window too.
+ unsigned int get_window_size() const override {
+ // _Mround is a multiple of out_height by definition.
+ return (_Mround / strategy::out_height()) * _nbatches * _nmulti;
+ }
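+    // e.g. (illustrative) M=100, out_height()==4: _Mround=100, giving 25
+    // window units per batch; with 2 batches and 3 multis the total window
+    // is 25 * 2 * 3 = 150.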
+
+ // Execute
+ void execute(unsigned int start, unsigned int end, int threadid) override {
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+ strategy strat(_ci);
+
+ /* Make sure we've been set up correctly. */
+ assert(_B_transposed);
+
+ const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height());
+ const unsigned int window_per_multi = window_per_batch * _nbatches;
+
+ const unsigned int first_multi = start / window_per_multi;
+ const unsigned int last_multi = end / window_per_multi;
+
+ const unsigned int first_batch = (start - (first_multi * window_per_multi)) / window_per_batch;
+ const unsigned int last_batch = (end - (last_multi * window_per_multi)) / window_per_batch;
+
+ const unsigned int first_row = ((start - (first_multi * window_per_multi)) % window_per_batch) * strategy::out_height();
+ const unsigned int last_row = ((end - (last_multi * window_per_multi)) % window_per_batch) * strategy::out_height();
+
+        static_assert(std::is_same<To, Toi>::value, "gemm_hybrid: Operand types must be the same.");
+        static_assert(std::is_same<Tr, Tri>::value, "gemm_hybrid: Result types must be the same.");
+
+ for (unsigned int multi = first_multi; multi <= last_multi; multi++) {
+ const unsigned int batch_0 = (multi == first_multi) ? first_batch : 0;
+ const unsigned int batch_max = (multi == last_multi) ? last_batch : (_nbatches - 1);
+
+ const Toi *b_panel = _B_transposed + (multi * _B_per_multi);
+
+ for (blockwalker current(*this); !current.done(); current.advance()) {
+                int kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
+                kern_k *= strategy::k_unroll();
+
+ int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
+
+ for (unsigned int batch = batch_0; batch <= batch_max; batch++) {
+ const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0;
+ const unsigned int m_end = ((multi == last_multi) && (batch == last_batch) ) ? last_row : _Msize;
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * bblocks * strategy::out_width());
+#endif
+
+ strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + current.k0(), this->_lda,
+ b_panel,
+ this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + current.x0(), this->_ldc,
+ (current.k0() == 0) ? _beta : static_cast<Tr>(1),
+ (m_end - m_start), (current.xmax() - current.x0()), kern_k);
+ }
+
+                b_panel += (bblocks * strategy::out_width() * kern_k);
+ }
+ }
+ }
+
+ // Interface implementation - pretransposed
+ bool B_is_pretransposed() const override {
+ return true;
+ }
+
+ bool B_pretranspose_required() const override {
+ return (_B_transposed==nullptr);
+ }
+
+ size_t get_B_pretransposed_array_size() const override {
+ return _B_per_multi * _nmulti * sizeof(Toi);
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
+ _B_transposed = buffer;
+ strategy strat(_ci);
+
+ for (unsigned int multi=0; multi < _nmulti; multi++) {
+ blockwalker current(*this);
+
+ do {
+ /* Figure out the size of each block. */
+ size_t x_size = (current.xmax() - current.x0());
+ size_t k_size = (current.kmax() - current.k0());
+
+ /* Round sizes up as needed. */
+ x_size = iceildiv(x_size, strategy::out_width());
+ x_size *= strategy::out_width();
+
+ k_size = iceildiv(k_size, strategy::k_unroll());
+ k_size *= strategy::k_unroll();
+
+ strat.transforms.PrepareB(
+ buffer, B + (multi * B_multi_stride), ldb,
+ current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
+
+ buffer += (x_size * k_size);
+ } while (current.advance());
+ }
+ }
+
+ void set_pretransposed_B_data(void *in_buffer) override {
+ _B_transposed = reinterpret_cast<Toi *>(in_buffer);
+ }
+};
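+
+/* Typical driver sequence (illustrative only - argument plumbing simplified):
+ *
+ *   GemmHybrid<strategy, To, Tr> gemm(args);
+ *   if (gemm.B_pretranspose_required()) {
+ *       void *buf = malloc(gemm.get_B_pretransposed_array_size());
+ *       gemm.pretranspose_B_array(buf, B, ldb, B_multi_stride);
+ *   }
+ *   gemm.set_arrays(A, lda, ...);   // inherited from GemmCommon
+ *   gemm.execute(0, gemm.get_window_size(), 0);
+ */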
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 6734e3cce0..bf80784b79 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,56 +22,53 @@
* SOFTWARE.
*/
-#include "gemv_batched.hpp"
+#include <arm_gemm.hpp>
-namespace arm_gemm {
-
-template<typename Top, typename Tret>
-class GemmImplementation {
-public:
- /* Is this implementation compatible with the args as provided? */
- virtual bool is_supported(const GemmArgs<Tret> &args) { return true; }
- /* Is this implementation "recommended" for these args (heuristic)? */
- virtual bool is_recommended(const GemmArgs<Tret> &args) { return true; }
- /* Instantiate this method please. */
- virtual UniqueGemmCommon<Top, Tret> instantiate(const GemmArgs<Tret> &args) = 0;
+#include <cstring>
+#include <functional>
- /* Indicate the "GemmMethod" for use as a selector */
- const GemmMethod method;
-
- virtual ~GemmImplementation() { }
-
- GemmImplementation(GemmMethod method) : method(method) { }
-};
+namespace arm_gemm {
-/* "gemv_batched" implementation is type-agnostic, so template it here. */
template<typename Top, typename Tret>
-class GemmImpl_gemv_batched : public GemmImplementation<Top, Tret> {
-public:
- bool is_supported(const GemmArgs<Tret> &args) override {
- return (args._Msize==1 && args._nbatches > 1);
- }
-
- UniqueGemmCommon<Top, Tret> instantiate(const GemmArgs<Tret> &args) override {
- return UniqueGemmCommon<Top, Tret> (new GemvBatched<Top, Tret>(args));
- }
-
- GemmImpl_gemv_batched() : GemmImplementation<Top, Tret>(GemmMethod::GEMV_BATCHED) { }
+struct GemmImplementation {
+ const GemmMethod method;
+ const char * name;
+ std::function<bool(const GemmArgs<Tret> &)> is_supported;
+ std::function<bool(const GemmArgs<Tret> &)> is_recommended;
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs<Tret> &)> instantiate;
};
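+
+/* Each type specialisation below defines a plain array of these descriptors;
+ * a nullptr is_supported/is_recommended means "always". A typical entry (from
+ * the int16 specialisation) looks like:
+ *
+ *   { GemmMethod::GEMM_INTERLEAVED, "gemm_s16_12x8", nullptr, nullptr,
+ *     [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args); } },
+ */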
/* "Master" function implemented for each valid combination of types.
* Returns a list of GEMM implementation descriptors for processing by the
- * other functions. */
+ * other functions, terminated by an implementation with
+ * method==GemmMethod::DEFAULT. */
template<typename Top, typename Tret>
-std::vector<GemmImplementation<Top, Tret> *> &gemm_implementation_list();
+const GemmImplementation<Top, Tret> *gemm_implementation_list();
+/*
+ * Select a GEMM implementation for the given arguments.
+ *
+ * The logic here returns the first method on the list which supports the
+ * requested problem parameters, matches the provided filters (method and/or
+ * name string match) and recommends itself.
+ *
+ * If there is no such method, it will return the first method which
+ * supports the requested parameters and passes the filters, regardless of
+ * recommendation.
+ *
+ * If no method supports the requested parameters and passes the filters,
+ * this function returns false and doesn't touch the provided pointer
+ * reference.
+ */
template<typename Top, typename Tret>
-GemmImplementation<Top, Tret> *find_implementation(GemmArgs<Tret> &args, GemmConfig *cfg) {
+bool find_implementation(const GemmArgs<Tret> &args, const GemmImplementation<Top, Tret> * &impl) {
auto gemms = gemm_implementation_list<Top, Tret>();
+ const GemmConfig *cfg = args._cfg;
- for(auto &&i : gemms) {
+ const GemmImplementation<Top, Tret> *saved_impl = nullptr;
+
+ for (auto i = gemms; i->method != GemmMethod::DEFAULT; i++) {
/* Skip if this implementation doesn't support these args. */
- if (!i->is_supported(args)) {
+ if (i->is_supported != nullptr && !i->is_supported(args)) {
continue;
}
@@ -80,52 +77,92 @@ GemmImplementation<Top, Tret> *find_implementation(GemmArgs<Tret> &args, GemmCon
continue;
}
- /* If no specific method is requested, check that this method recommends itself. */
- if ((!cfg || cfg->method == GemmMethod::DEFAULT) && !i->is_recommended(args)) {
+ /* Skip if a filter is to be applied and it doesn't match. */
+ if (cfg && cfg->filter != "" && !strstr(i->name, cfg->filter.c_str())) {
+ continue;
+ }
+
+        /* At this point, if we don't have a saved implementation, save this
+         * one. This ensures we always return something that passed the
+         * filters, even if nothing recommends itself.
+         */
+ if (saved_impl == nullptr) {
+ saved_impl=i;
+ }
+
+ /* Check that this method recommends itself. */
+ if (i->is_recommended != nullptr && !i->is_recommended(args)) {
+ continue;
+ }
+
+ impl=i;
+
+ return true;
+ }
+
+ /* We didn't find an option matching the filters that recommended
+ * itself. But if we found something earlier that matched the filters
+ * but wasn't recommended, return it here. */
+ if (saved_impl != nullptr) {
+ impl = saved_impl;
+ return true;
+ }
+
+ return false;
+}
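+
+/* For example, a caller that sets GemmConfig::filter to "dot" restricts the
+ * search above to kernels whose name contains "dot" (the strstr() check). */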
+
+template<typename Top, typename Tret>
+std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args) {
+ std::vector<std::string> res;
+
+ auto gemms = gemm_implementation_list<Top, Tret>();
+
+ for (auto i = gemms; i->method != GemmMethod::DEFAULT; i++) {
+ /* Check that this implementation supports the presented problem. */
+ if (i->is_supported != nullptr && !i->is_supported(args)) {
continue;
}
- return i;
+ res.push_back(i->name);
}
- return nullptr;
+ return res;
}
template<typename Top, typename Tret>
-UniqueGemmCommon<Top, Tret> gemm(GemmArgs<Tret> &args, GemmConfig *cfg) {
- auto impl = find_implementation<Top, Tret>(args, cfg);
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs<Tret> &args) {
+ const GemmImplementation<Top, Tret> *impl;
- if (impl) {
- return impl->instantiate(args);
+ if (find_implementation<Top, Tret>(args, impl)) {
+ return UniqueGemmCommon<Top, Tret>(impl->instantiate(args));
}
return UniqueGemmCommon<Top, Tret>(nullptr);
}
template<typename Top, typename Tret>
-GemmMethod get_gemm_method(GemmArgs<Tret> &args) {
- auto impl = find_implementation<Top, Tret>(args, nullptr);
+KernelDescription get_gemm_method(const GemmArgs<Tret> &args) {
+ const GemmImplementation<Top, Tret> *impl;
- if (impl) {
- return impl->method;
+ if (find_implementation<Top, Tret>(args, impl)) {
+ return KernelDescription(impl->method, impl->name);
}
/* This shouldn't happen - there should always be at least one valid implementation. */
- return GemmMethod::DEFAULT;
+ return KernelDescription();
}
template<typename Top, typename Tret>
-bool method_is_compatible(GemmMethod method, GemmArgs<Tret> &args) {
+bool method_is_compatible(GemmMethod method, const GemmArgs<Tret> &args) {
/* Determine if the method is valid by attempting to obtain an implementation specifying this method. */
- GemmConfig cfg(method);
+ GemmConfig cfg(method);
+ GemmArgs<Tret> myargs = args;
- auto impl = find_implementation<Top, Tret>(args, &cfg);
+ myargs._cfg = &cfg;
- if (impl) {
- return true;
- }
+ const GemmImplementation<Top, Tret> *impl;
- return false;
+ return find_implementation<Top, Tret>(myargs, impl);
}
-} // namespace arm_gemm
+} // namespace arm_gemm
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index ad171a7f9a..b4503dd6a2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,30 +32,33 @@
namespace arm_gemm {
-class GemmImpl_gemm_s16_interleaved : public GemmImplementation<int16_t, int32_t> {
-public:
- UniqueGemmCommon<int16_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
- return UniqueGemmCommon<int16_t, int32_t>(new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args));
- }
-
- GemmImpl_gemm_s16_interleaved() : GemmImplementation<int16_t, int32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_s16_interleaved gemm_s16_interleaved_impl{};
-
-static std::vector<GemmImplementation<int16_t, int32_t> *> gemm_s16_methods = {
- &gemm_s16_interleaved_impl
+static const GemmImplementation<int16_t, int32_t> gemm_s16_methods[] = {
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_s16_12x8",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args); }
+},
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
template<>
-std::vector<GemmImplementation<int16_t, int32_t> *> &gemm_implementation_list<int16_t, int32_t>() {
+const GemmImplementation<int16_t, int32_t> *gemm_implementation_list<int16_t, int32_t>() {
return gemm_s16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(GemmArgs<int32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<int16_t, int32_t>(GemmArgs<int32_t> &args);
-template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, GemmArgs<int32_t> &args);
+template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const GemmArgs<int32_t> &args);
+template KernelDescription get_gemm_method<int16_t, int32_t>(const GemmArgs<int32_t> &args);
+template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
+template std::vector<std::string> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 627d8abdb9..34dc8bc341 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,67 +27,67 @@
#include "gemm_common.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
#include "kernels/a64_gemm_s16_12x8.hpp"
#include "kernels/a64_gemm_s8_12x8.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
+#include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
namespace arm_gemm {
+static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
#ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
-public:
- UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
- return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args));
- }
-
- GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-#else
-
-class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
-public:
- bool is_supported(const GemmArgs<int32_t> &args) override {
- return args._ci->has_dotprod();
- }
-
- UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
- return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args));
- }
-
- GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-
+{
+ GemmMethod::GEMM_NATIVE,
+ "native_s8s32_dot_4VLx4",
+ [](const GemmArgs<int32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
+ [](const GemmArgs<int32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+ [](const GemmArgs<int32_t> &args) { return new GemmNative<native_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_s8s32_dot_3VLx8",
+ [](const GemmArgs<int32_t> &args) { return (args._Ksize>4); },
+ nullptr,
+ [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); }
+},
#endif
-
-class GemmImpl_gemm_s8_interleaved : public GemmImplementation<int8_t, int32_t> {
-public:
- UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
- return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args));
- }
-
- GemmImpl_gemm_s8_interleaved() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_s8_interleaved_dot gemm_s8_interleaved_dot_impl{};
-static GemmImpl_gemm_s8_interleaved gemm_s8_interleaved_impl{};
-
-static std::vector<GemmImplementation<int8_t, int32_t> *> gemm_s8_methods = {
- &gemm_s8_interleaved_dot_impl,
- &gemm_s8_interleaved_impl
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_s8_12x8",
+ [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod(); },
+ nullptr,
+ [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_s8_4x4",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<int32_t> &args) { return new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args); }
+},
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
template<>
-std::vector<GemmImplementation<int8_t, int32_t> *> &gemm_implementation_list<int8_t, int32_t>() {
+const GemmImplementation<int8_t, int32_t> *gemm_implementation_list<int8_t, int32_t>() {
return gemm_s8_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(GemmArgs<int32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<int8_t, int32_t>(GemmArgs<int32_t> &args);
-template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, GemmArgs<int32_t> &args);
+template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const GemmArgs<int32_t> &args);
+template KernelDescription get_gemm_method<int8_t, int32_t>(const GemmArgs<int32_t> &args);
+template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
+template std::vector<std::string> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 0e58a4d01f..436438f351 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -318,50 +318,57 @@ public:
/* Constructor */
GemmInterleaved(const GemmArgs<Tr> &args)
- : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
- _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
- _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
- _pretransposed(args._pretransposed_hint) {
+ : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
+ _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+ _pretransposed(args._pretransposed_hint) {
const unsigned int L1_size = _ci->get_L1_cache_size();
const unsigned int L2_size = _ci->get_L2_cache_size();
assert(_maxthreads > 0);
- // Work out blocking parameters
-
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+ // Work out blocking parameters, or override from provided GemmConfig
+ if (args._cfg && args._cfg->inner_block_size) {
+ _k_block = args._cfg->inner_block_size;
+ } else {
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
- // Needs to be (at least a single) multiple of the K unroll level.
- _k_block /= strategy::k_unroll();
- _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+ // Needs to be (at least a single) multiple of the K unroll level.
+ _k_block /= strategy::k_unroll();
+ _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
- // Now tune to presented problem size; this is how many blocks we need.
- int num_k_blocks = iceildiv(_Ksize, _k_block);
+ // Now tune to presented problem size; this is how many blocks we need.
+ int num_k_blocks = iceildiv(_Ksize, _k_block);
- // So divide the space equally into that many blocks.
- _k_block = iceildiv(_Ksize, num_k_blocks);
+ // So divide the space equally into that many blocks.
+ _k_block = iceildiv(_Ksize, num_k_blocks);
- // And round UP to the K unroll level required.
- _k_block = iceildiv(_k_block, strategy::k_unroll());
- _k_block *= strategy::k_unroll();
+ // And round UP to the K unroll level required.
+ _k_block = iceildiv(_k_block, strategy::k_unroll());
+ _k_block *= strategy::k_unroll();
+ }
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
- (sizeof(Toi) * _k_block);
+ if (args._cfg && args._cfg->outer_block_size) {
+ _x_block = args._cfg->outer_block_size;
+ } else {
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+ (sizeof(Toi) * _k_block);
- // Needs to be (at least a single) multiple of the kernel output width.
- _x_block /= strategy::out_width();
- _x_block = std::max(_x_block, 1U) * strategy::out_width();
+ // Needs to be (at least a single) multiple of the kernel output width.
+ _x_block /= strategy::out_width();
+ _x_block = std::max(_x_block, 1U) * strategy::out_width();
- // And tune to the presented problem size.
- int num_x_blocks = iceildiv(_Nsize, _x_block);
- _x_block = iceildiv(_Nsize, num_x_blocks);
+ // And tune to the presented problem size.
+ int num_x_blocks = iceildiv(_Nsize, _x_block);
+ _x_block = iceildiv(_Nsize, num_x_blocks);
- _x_block = iceildiv(_x_block, strategy::out_width());
- _x_block *= strategy::out_width();
+ _x_block = iceildiv(_x_block, strategy::out_width());
+ _x_block *= strategy::out_width();
+ }
// Work out the rounded size of M - needed for some buffers.
_Mround = iceildiv(_Msize, strategy::out_height());
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index baa1316745..579533418d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,11 +74,11 @@ public:
GemmNative(GemmNative &) = delete;
GemmNative & operator= (GemmNative &) = delete;
- GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta) :
- _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci) {
+ GemmNative(const GemmArgs<Tr> &args)
+ : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) {
/* For now don't do any blocking. TODO: figure out if we should. */
- k_block = K;
- n_block = N;
+ k_block = _Ksize;
+ n_block = _Nsize;
}
// Window is amount per multi multiplied by total number of multis.
@@ -105,8 +105,13 @@ public:
unsigned int y0 = batch_pos * strategy::out_height();
- for (unsigned int pos=start; pos<end; pos++) {
- const unsigned int ymax = std::min(y0 + strategy::out_height(), _Msize);
+ for (unsigned int l=end-start; l>0; ) {
+ // Do work from here to the end of the current batch/multi
+ const unsigned int ymax = std::min(y0 + (l * strategy::out_height()), _Msize);
+
+ // Work out how many units this is and subtract from loop counter.
+ l -= ((ymax - y0) + (strategy::out_height() - 1)) / strategy::out_height();
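+            // e.g. (illustrative) out_height()==4, _Msize==10, y0==0, l==5:
+            // ymax is 10, which consumes (10 + 3) / 4 == 3 window units and
+            // leaves l==2 for the next batch/multi.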
+
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * _Nsize * _Ksize);
#endif
@@ -117,7 +122,7 @@ public:
_beta, (ymax-y0), _Nsize, _Ksize);
/* Advance to next item */
- y0 += strategy::out_height();
+ y0 = ymax;
/* Check for batch/multi overflow */
if (y0 >= _Msize) {
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index feea4829d1..6bcbca9e8b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,31 +32,34 @@
namespace arm_gemm {
-class GemmImpl_gemm_u16_interleaved : public GemmImplementation<uint16_t, uint32_t> {
-public:
- UniqueGemmCommon<uint16_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
- return UniqueGemmCommon<uint16_t, uint32_t>(new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args));
- }
-
- GemmImpl_gemm_u16_interleaved() : GemmImplementation<uint16_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_u16_interleaved gemm_u16_interleaved_impl{};
-
-static std::vector<GemmImplementation<uint16_t, uint32_t> *> gemm_u16_methods = {
- &gemm_u16_interleaved_impl
+static const GemmImplementation<uint16_t, uint32_t> gemm_u16_methods[] = {
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_u16_12x8",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args); }
+},
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
template<>
-std::vector<GemmImplementation<uint16_t, uint32_t> *> &gemm_implementation_list<uint16_t, uint32_t>() {
+const GemmImplementation<uint16_t, uint32_t> *gemm_implementation_list<uint16_t, uint32_t>() {
return gemm_u16_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(GemmArgs<uint32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<uint16_t, uint32_t>(GemmArgs<uint32_t> &args);
-template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, GemmArgs<uint32_t> &args);
+template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template KernelDescription get_gemm_method<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
+template std::vector<std::string> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index b7c1bab6bd..3c8df3f044 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,64 +27,66 @@
#include "gemm_common.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
#include "kernels/a64_gemm_u16_12x8.hpp"
#include "kernels/a64_gemm_u8_12x8.hpp"
#include "kernels/a64_gemm_u8_4x4.hpp"
#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
+#include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
namespace arm_gemm {
+static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
#ifdef __ARM_FEATURE_SVE
-class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
-public:
- UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
- return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args));
- }
-
- GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
-#else
-class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
-public:
- bool is_supported(const GemmArgs<uint32_t> &args) override {
- return args._ci->has_dotprod();
- }
-
- UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
- return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args));
- }
-
- GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
-};
+{
+ GemmMethod::GEMM_NATIVE,
+ "native_u8u32_dot_4VLx4",
+ [](const GemmArgs<uint32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
+ [](const GemmArgs<uint32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+ [](const GemmArgs<uint32_t> &args) { return new GemmNative<native_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "interleaved_u8u32_dot_3VLx8",
+ [](const GemmArgs<uint32_t> &args) { return (args._Ksize>4); },
+ nullptr,
+ [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); }
+},
#endif
-
-class GemmImpl_gemm_u8_interleaved : public GemmImplementation<uint8_t, uint32_t> {
-public:
- UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
- return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args));
- }
-
- GemmImpl_gemm_u8_interleaved() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED) { }
-};
-
-static GemmImpl_gemm_u8_interleaved_dot gemm_u8_interleaved_dot_impl{};
-static GemmImpl_gemm_u8_interleaved gemm_u8_interleaved_impl{};
-
-static std::vector<GemmImplementation<uint8_t, uint32_t> *> gemm_u8_methods = {
- &gemm_u8_interleaved_dot_impl,
- &gemm_u8_interleaved_impl
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_u8_12x8",
+ [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod(); },
+ nullptr,
+ [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "gemm_u8_4x4",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<uint32_t> &args) { return new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args); }
+},
+{
+ GemmMethod::DEFAULT,
+ "",
+ nullptr,
+ nullptr,
+ nullptr
+}
};
template<>
-std::vector<GemmImplementation<uint8_t, uint32_t> *> &gemm_implementation_list<uint8_t, uint32_t>() {
+const GemmImplementation<uint8_t, uint32_t> *gemm_implementation_list<uint8_t, uint32_t>() {
return gemm_u8_methods;
}
/* Explicitly instantiate the external functions for these types. */
-template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(GemmArgs<uint32_t> &args, GemmConfig *cfg);
-template GemmMethod get_gemm_method<uint8_t, uint32_t>(GemmArgs<uint32_t> &args);
-template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, GemmArgs<uint32_t> &args);
+template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template KernelDescription get_gemm_method<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
+template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
+template std::vector<std::string> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index d65971e47d..40f7f2b7cd 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ public:
GemmArgs<Tr> newargs = args;
newargs._Msize = args._nbatches;
newargs._nbatches = 1;
- _subgemm = gemm<To,Tr>(newargs, nullptr);
+ _subgemm = gemm<To,Tr>(newargs);
}
void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index 241c5fea27..5cf42761e6 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,10 +64,11 @@ public:
GemvNativeTransposed(GemvNativeTransposed &) = delete;
GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete;
- GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) {
+ GemvNativeTransposed(const GemmArgs<Tr> &args)
+ : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) {
/* For now don't do any blocking. TODO: figure out if we should. */
- m_block = K;
- n_block = N;
+ m_block = _Ksize;
+ n_block = _Nsize;
}
// Window is number of out_width blocks times number of multis.
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index e53ddb26c1..842339ef23 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,12 +68,21 @@ public:
GemvPretransposed(GemvPretransposed &) = delete;
GemvPretransposed & operator= (GemvPretransposed &) = delete;
- GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta) :
- _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci),
- _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) {
+ GemvPretransposed(const GemmArgs<Tr> &args)
+ : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _trB(args._trB), _beta(args._beta), _ci(args._ci),
+ _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) {
/* For now don't do any blocking. TODO: figure out if we should. */
- m_block = K;
- n_block = N;
+ if (args._cfg && args._cfg->inner_block_size) {
+ m_block = args._cfg->inner_block_size;
+ } else {
+ m_block = _Ksize;
+ }
+
+ if (args._cfg && args._cfg->outer_block_size) {
+ n_block = args._cfg->outer_block_size;
+ } else {
+ n_block = _Nsize;
+ }
}
// Window is number of out_width blocks, times number of multis.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
index 418a375a61..4ad38cbf62 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,9 +32,9 @@
// Kernel implementation.
//
// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 12x8), the chunks being arranged in a row major fashion.
+// 24x8), the chunks being arranged in a row major fashion.
//
// Note that the intent of this is that either ablocks or bblocks will be 1
// - this construction allows the output loop to proceed in either order.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
new file mode 100644
index 0000000000..0c387ff6df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void a64_sgemm_nativeA_pretransposeB_16x4(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int);
+
+// Native A/Pretranspose B SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
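+//
+// For example (hypothetical - this strategy currently ships only the generic
+// kernel), a constructor could select a tuned variant at runtime:
+//
+//   sgemm_nativeA_pretransposeB_16x4(const CPUInfo *ci) {
+//       if (ci->get_cpu_model() == CPUModel::A55r1) {
+//           kernel = some_a55_variant;   // hypothetical kernel pointer
+//       }
+//   }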
+class sgemm_nativeA_pretransposeB_16x4 {
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int);
+
+ /* Desired data layout for B buffer (used for pretranspose) */
+ static const int B_interleave = 16;
+ static const int B_block = 1;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static int out_width() {
+ return 16;
+ }
+
+ static int out_height() {
+ return 4;
+ }
+
+ static int k_unroll() {
+ return 1;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 4, 16> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_sgemm_nativeA_pretransposeB_16x4;
+
+    sgemm_nativeA_pretransposeB_16x4(const CPUInfo *) { }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp
new file mode 100644
index 0000000000..b2516f8797
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp
@@ -0,0 +1,970 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+
+#include <arm_neon.h>
+
+namespace arm_gemm {
+
+void a64_sgemm_nativeA_pretransposeB_16x4(const float *A, int lda, const float *B_panel, float *C, int ldc, float beta, unsigned int numrows, unsigned int numcols, unsigned int K) {
+ const bool oddk = ((K % 8) >= 4);
+ const bool beta0 = (beta == 0.0f);
+ const unsigned int oddones = (K % 4);
+
+ /* Use some small temporary arrays to cope with "ragged" M/N sizes.
+ *
+ * "dummy_A_buf" is used to avoid overreading the A input for ragged M,
+ * and also for output if N is not ragged.
+ *
+ * Since the B input is pretransposed it will be padded as needed, so no
+ * need to worry about overreading that.
+ *
+ * "C_buf" is used to avoid overreading or overwriting the output for
+ * ragged N cases.
+ */
+ float dummy_A_buf[16];
+ float C_buf[64];
+
+ std::memset(dummy_A_buf, 0, sizeof(dummy_A_buf));
+ std::memset(C_buf, 0, sizeof(C_buf));
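+
+    /* e.g. numrows==6: the first pass of the row loop (y==0) uses all four
+     * rows; the second (y==4) has active_rows==2, so the row 2/3 A pointers
+     * fall back to dummy_A_buf with zero increments, and the matching C
+     * pointers are redirected to the dummy buffer as well. */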
+
+ for (unsigned int y=0; y<numrows; y+=4) {
+ const float *b_ptr = B_panel;
+ const unsigned int active_rows = std::min(numrows - y, 4U);
+
+ /* Increment values to be used to advance A pointers - these get set
+ * to zero when the corresponding row isn't being used due to ragged
+ * M, so it will just read the dummy buffer repeatedly. Values are
+ * in bytes (8x sizeof(float)). */
+ const unsigned long a_incr1 = (active_rows > 1) ? 32 : 0;
+ const unsigned long a_incr2 = (active_rows > 2) ? 32 : 0;
+ const unsigned long a_incr3 = (active_rows > 3) ? 32 : 0;
+
+ /* Starting points for A pointers on this loop */
+ const float * const a_ptr0_base = A + (y * lda);
+ const float * const a_ptr1_base = (active_rows > 1) ? (a_ptr0_base + lda) : dummy_A_buf;
+ const float * const a_ptr2_base = (active_rows > 2) ? (a_ptr1_base + lda) : dummy_A_buf;
+ const float * const a_ptr3_base = (active_rows > 3) ? (a_ptr2_base + lda) : dummy_A_buf;
+
+ /* Starting points for C pointers on this loop */
+ float *c_ptr0 = C + (y * ldc);
+ float *c_ptr1 = (active_rows > 1) ? (c_ptr0 + ldc) : dummy_A_buf;
+ float *c_ptr2 = (active_rows > 2) ? (c_ptr1 + ldc) : dummy_A_buf;
+ float *c_ptr3 = (active_rows > 3) ? (c_ptr2 + ldc) : dummy_A_buf;
+
+ for (unsigned int x0=0; x0<numcols; x0+=16) {
+ const unsigned int active_cols = std::min(numcols - x0, 16U);
+ const bool use_result_buf = (active_cols < 16);
+
+ /* Reset the A pointers for this loop. */
+ const float *a_ptr0 = a_ptr0_base;
+ const float *a_ptr1 = a_ptr1_base;
+ const float *a_ptr2 = a_ptr2_base;
+ const float *a_ptr3 = a_ptr3_base;
+
+ /* Override C pointers if the result buffer is in use. */
+ if (use_result_buf) {
+ c_ptr0 = C_buf;
+ c_ptr1 = C_buf + 16;
+ c_ptr2 = C_buf + 32;
+ c_ptr3 = C_buf + 48;
+
+ /* If beta is non-zero, prepopulate the result buffer */
+ if (!beta0) {
+ for (unsigned int row=0; row<active_rows; row++) {
+ for (unsigned int col=0; col<active_cols; col++) {
+ C_buf[row * 16 + col] = C[((y + row) * ldc) + (x0 + col)];
+ }
+ }
+ }
+ }
+
+ unsigned int loops = ((K+4)/8) - 1;
+ unsigned int odds = oddones;
+
+ __asm __volatile (
+ "a0 .req v0\n"
+ "a1 .req v1\n"
+ "a2 .req v2\n"
+ "a3 .req v3\n"
+ "a0a .req v4\n"
+ "a1a .req v5\n"
+ "a2a .req v6\n"
+ "a3a .req v7\n"
+ "bb0 .req v8\n"
+ "bb1 .req v9\n"
+ "bb2 .req v10\n"
+ "bb3 .req v11\n"
+ "b0a .req v12\n"
+ "b1a .req v13\n"
+ "b2a .req v14\n"
+ "b3a .req v15\n"
+
+ "a0q .req q0\n"
+ "a1q .req q1\n"
+ "a2q .req q2\n"
+ "a3q .req q3\n"
+ "a0aq .req q4\n"
+ "a1aq .req q5\n"
+ "a2aq .req q6\n"
+ "a3aq .req q7\n"
+ "b0q .req q8\n"
+ "b1q .req q9\n"
+ "b2q .req q10\n"
+ "b3q .req q11\n"
+ "b0aq .req q12\n"
+ "b1aq .req q13\n"
+ "b2aq .req q14\n"
+ "b3aq .req q15\n"
+
+ "movi v16.4s, #0x0\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "movi v21.4s, #0x0\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+ "movi v25.4s, #0x0\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+ "movi v26.4s, #0x0\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+ "cbz %w[beta0], 5f\n"
+ "movi v27.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x40]")
+ "movi v28.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x80]")
+ "movi v29.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "movi v30.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+ "movi v31.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+
+ // Skip if no complete loops.
+ "cbz %w[loops], 4f\n"
+ "b 1f\n"
+
+ // If beta is non-zero, need to load and multiply by beta
+ "5:\n"
+ "ld1r {v4.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #16]\n"
+ "ldr q18, [%[c_ptr0], #32]\n"
+ "ldr q19, [%[c_ptr0], #48]\n"
+
+ "ldr q20, [%[c_ptr1]]\n"
+ "fmul v16.4s, v16.4s, v4.4s\n"
+ "ldr q21, [%[c_ptr1], #16]\n"
+ "fmul v17.4s, v17.4s, v4.4s\n"
+ "ldr q22, [%[c_ptr1], #32]\n"
+ "fmul v18.4s, v18.4s, v4.4s\n"
+ "ldr q23, [%[c_ptr1], #48]\n"
+ "fmul v19.4s, v19.4s, v4.4s\n"
+
+ "ldr q24, [%[c_ptr2]]\n"
+ "fmul v20.4s, v20.4s, v4.4s\n"
+ "ldr q25, [%[c_ptr2], #16]\n"
+ "fmul v21.4s, v21.4s, v4.4s\n"
+ "ldr q26, [%[c_ptr2], #32]\n"
+ "fmul v22.4s, v22.4s, v4.4s\n"
+ "ldr q27, [%[c_ptr2], #48]\n"
+ "fmul v23.4s, v23.4s, v4.4s\n"
+
+ "ldr q28, [%[c_ptr3]]\n"
+ "fmul v24.4s, v24.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x40]")
+ "ldr q29, [%[c_ptr3], #16]\n"
+ "fmul v25.4s, v25.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x80]")
+ "ldr q30, [%[c_ptr3], #32]\n"
+ "fmul v26.4s, v26.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "ldr q31, [%[c_ptr3], #48]\n"
+ "fmul v27.4s, v27.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+
+ "fmul v28.4s, v28.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ "fmul v29.4s, v29.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ "fmul v30.4s, v30.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ "fmul v31.4s, v31.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+
+ "cbz %w[loops], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x240]")
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr], #64]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #80]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #96]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #112]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x280]")
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr], #128]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "subs %w[loops], %w[loops], #1\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #144]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #160]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #176]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x2C0]")
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr], #192]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "add %[a_ptr1], %[a_ptr1], %[a_incr1]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "add %[a_ptr2], %[a_ptr2], %[a_incr2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #208]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "add %[a_ptr3], %[a_ptr3], %[a_incr3]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[a_ptr0], #0x40]")
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #224]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[a_ptr1], #0x40]")
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #240]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], #512\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr], #-256]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #-240]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ ASM_PREFETCH("[%[a_ptr2], #0x40]")
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #-224]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #-208]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr], #-192]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #-176]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #-160]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #-144]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr], #-128]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ ASM_PREFETCH("[%[a_ptr3], #0x40]")
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #-112]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #-96]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #-80]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+ "ldr b0q, [%[b_ptr], #-64]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+ "ldr b1q, [%[b_ptr], #-48]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+ "ldr b2q, [%[b_ptr], #-32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+ "ldr b3q, [%[b_ptr], #-16]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "bne 1b\n"
+
+ // Skip to here
+ "4:\n"
+
+ // Detached final iteration
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "cbnz %w[oddk], 2f\n" // Deal with odd K before we load a0a
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "add %[a_ptr1], %[a_ptr1], %[a_incr1]\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "add %[a_ptr2], %[a_ptr2], %[a_incr2]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "add %[a_ptr3], %[a_ptr3], %[a_incr3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr0], #0x40]")
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr1], #0x40]")
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr2], #0x40]")
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr3], #0x40]")
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "cbnz %w[odds], 6f\n"
+
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // Odd K case: Just do 4 more.
+ "2:\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #16\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "add %[a_ptr1], %[a_ptr1], #16\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "add %[a_ptr2], %[a_ptr2], #16\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "add %[a_ptr3], %[a_ptr3], #16\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr0], #0x40]")
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr1], #0x40]")
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr2], #0x40]")
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr3], #0x40]")
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "cbnz %w[odds], 7f\n"
+
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // "Odd ones" - lead in from even
+ "6:\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+ "b 8f\n"
+
+ // "Odd ones" - lead in from odd
+ "7:\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+
+ // "Odd ones" - loop
+ "8:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "bne 8b\n"
+
+ // "Odd ones" - detached final iteration
+ "9:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "str q27, [%[c_ptr2], #48]\n"
+
+ "3:\n"
+ "str q28, [%[c_ptr3]]\n"
+ // Increment C pointers for the next loop. This looks odd if we
+ // are using the result buffer, but it's harmless: using the
+ // result buffer implies there will be no next loop.
+ "add %[c_ptr0], %[c_ptr0], #64\n"
+ "str q29, [%[c_ptr3], #16]\n"
+ "add %[c_ptr1], %[c_ptr1], %[a_incr1], LSL #1\n"
+ "str q30, [%[c_ptr3], #32]\n"
+ "add %[c_ptr2], %[c_ptr2], %[a_incr2], LSL #1\n"
+ "str q31, [%[c_ptr3], #48]\n"
+ "add %[c_ptr3], %[c_ptr3], %[a_incr3], LSL #1\n"
+
+ : [a_ptr0] "+r" (a_ptr0), [a_ptr1] "+r" (a_ptr1), [a_ptr2] "+r" (a_ptr2), [a_ptr3] "+r" (a_ptr3),
+ [b_ptr] "+r" (b_ptr), [loops] "+r" (loops), [odds] "+r" (odds),
+ [c_ptr0] "+r" (c_ptr0), [c_ptr1] "+r" (c_ptr1), [c_ptr2] "+r" (c_ptr2), [c_ptr3] "+r" (c_ptr3)
+ : [oddk] "r" (oddk), [beta0] "r" (beta0), [betaptr] "r" (&beta),
+ [a_incr1] "r" (a_incr1), [a_incr2] "r" (a_incr2), [a_incr3] "r" (a_incr3)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+ "cc", "memory"
+ );
+
+ /* Copy results from result buffer if needed. */
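+ /* C_buf appears to be laid out as rows of 16 floats (one 4x16
+ output tile), hence the row * 16 stride below. */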
+ if (use_result_buf) {
+ for (unsigned int row=0; row<active_rows; row++) {
+ for (unsigned int col=0; col<active_cols; col++) {
+ C[((y + row) * ldc) + (x0 + col)] = C_buf[row * 16 + col];
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
new file mode 100644
index 0000000000..2b58b110c0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
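+// Argument order matches the definition in generic.cpp:
+// (const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K)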
+void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
+class hybrid_fp32_mla_4VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
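+ // One call produces a 4 x (4*VL) tile of C: out_height() rows by four
+ // SVE vectors of floats per row, with no extra K unrolling.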
+ static int out_height()
+ {
+ return 4;
+ }
+
+ static int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static int k_unroll()
+ {
+ return 1;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel = sve_hybrid_fp32_mla_4VLx4;
+
+ hybrid_fp32_mla_4VLx4(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000000..b8aa8252d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
@@ -0,0 +1,2005 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const int K_stride = K;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long leftovers = K;
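+ // How the counts above carve up K for the asm below (inferred from the
+ // loop structure, so treat as a sketch): 'loops' runs the software-pipelined
+ // main loop over 8 elements per iteration, the tail then handles 8 more
+ // (regs == 1) or 4 more (regs == 0), and 0-3 'leftovers' are done under
+ // predicate p6. e.g. K=23: loops=2 (16 elements), regs=0 (+4), leftovers=3.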
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
+ const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
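+ // Dispatch on the rows remaining in this block of 4; each case below is
+ // an asm body specialised for that many active rows.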
+ case 1:
+ __asm __volatile (
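+ // Predicate setup (same pattern in every case): p6 masks the K
+ // leftovers, p0-p3 mask the four vector-wide column strips against
+ // 'width', and p7 is all-true.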
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "2:\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "2:\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "2:\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "mov z28.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "2:\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
\ No newline at end of file
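
The hybrid fp32 kernel above, like the other kernels added by this patch, computes C = A*B + beta*C, with a beta == 0 fast path that zero-initialises the accumulators instead of reading (possibly uninitialised) C memory. A minimal scalar sketch of that contract, assuming plain row-major operands rather than the kernel's blocked/pretransposed layouts:

void reference_gemm(const float *A, int lda, const float *B, int ldb,
                    float *C, int ldc, float beta, int M, int N, int K) {
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            // Mirrors the "cbz %[beta0], 1f" branch above: skip the C load
            // entirely when beta is exactly zero.
            float acc = (beta == 0.0f) ? 0.0f : beta * C[m * ldc + n];
            for (int k = 0; k < K; k++) {
                acc += A[m * lda + k] * B[k * ldb + n];
            }
            C[m * ldc + n] = acc;
        }
    }
}
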
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
index 3fd738e673..9d88b60cee 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ public:
/* Kernel blocking parameters */
static int out_width()
{
- return svcnth() * 3;
+ return get_vector_length<__fp16>() * 3;
}
static int out_height()
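
Throughout this patch, out_width() switches from calling the SVE counting intrinsics (svcnth(), svcntw()) directly to a typed get_vector_length<T>() helper, so the blocking parameters read uniformly across element types. A hedged sketch of what such a helper can look like; the actual definition lives in arm_gemm's utils.hpp, which this hunk does not show, and may differ:

#include <arm_sve.h>

template <typename T>
inline unsigned long get_vector_length() {
    // Lanes of type T per SVE vector: vector bytes / element bytes.
    return svcntb() / sizeof(T);
}

On a 256-bit SVE implementation, get_vector_length<__fp16>() is 16, so this kernel's out_width() is 48 half-precision columns.
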
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
index 92ec888244..517895ca7f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,24 +48,24 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"mov z8.h, #0\n"
"ptrue p0.h\n"
"mov z9.h, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
"mov z10.h, #0\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
"mov z11.h, #0\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z12.h, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
"mov z13.h, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ "ld1h z2.h, p0/z, [%[b_ptr]]\n"
"mov z14.h, #0\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
+ "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z15.h, #0\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
+ "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
"mov z16.h, #0\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
+ "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
"mov z17.h, #0\n"
+ "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
"mov z18.h, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
"mov z19.h, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
"mov z20.h, #0\n"
"mov z21.h, #0\n"
"mov z22.h, #0\n"
@@ -199,37 +199,31 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z30.h, z7.h, z1.h[6]\n"
"fmla z31.h, z7.h, z1.h[7]\n"
"fmla z8.h, z2.h, z0.h[0]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z9.h, z2.h, z0.h[1]\n"
"fmla z10.h, z2.h, z0.h[2]\n"
"fmla z11.h, z2.h, z0.h[3]\n"
"fmla z12.h, z2.h, z0.h[4]\n"
+ "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z13.h, z2.h, z0.h[5]\n"
"fmla z14.h, z2.h, z0.h[6]\n"
"fmla z15.h, z2.h, z0.h[7]\n"
"fmla z16.h, z3.h, z0.h[0]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z17.h, z3.h, z0.h[1]\n"
"fmla z18.h, z3.h, z0.h[2]\n"
"fmla z19.h, z3.h, z0.h[3]\n"
"fmla z20.h, z3.h, z0.h[4]\n"
+ "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z21.h, z3.h, z0.h[5]\n"
"fmla z22.h, z3.h, z0.h[6]\n"
"fmla z23.h, z3.h, z0.h[7]\n"
"fmla z24.h, z4.h, z0.h[0]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z25.h, z4.h, z0.h[1]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z26.h, z4.h, z0.h[2]\n"
- "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z27.h, z4.h, z0.h[3]\n"
- "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.h, z4.h, z0.h[4]\n"
- "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.h, z4.h, z0.h[5]\n"
- "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.h, z4.h, z0.h[6]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"b 4f\n"
"3:\n"
@@ -260,39 +254,39 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z30.h, z4.h, z0.h[6]\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"fmla z8.h, z5.h, z1.h[0]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z9.h, z5.h, z1.h[1]\n"
"fmla z10.h, z5.h, z1.h[2]\n"
"fmla z11.h, z5.h, z1.h[3]\n"
"fmla z12.h, z5.h, z1.h[4]\n"
+ "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z13.h, z5.h, z1.h[5]\n"
"fmla z14.h, z5.h, z1.h[6]\n"
"fmla z15.h, z5.h, z1.h[7]\n"
"fmla z16.h, z6.h, z1.h[0]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z17.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z1.h[2]\n"
"fmla z19.h, z6.h, z1.h[3]\n"
"fmla z20.h, z6.h, z1.h[4]\n"
+ "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z21.h, z6.h, z1.h[5]\n"
"fmla z22.h, z6.h, z1.h[6]\n"
"fmla z23.h, z6.h, z1.h[7]\n"
"fmla z24.h, z7.h, z1.h[0]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z25.h, z7.h, z1.h[1]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z26.h, z7.h, z1.h[2]\n"
- "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z27.h, z7.h, z1.h[3]\n"
- "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.h, z7.h, z1.h[4]\n"
- "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.h, z7.h, z1.h[5]\n"
- "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.h, z7.h, z1.h[6]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.h, z7.h, z1.h[7]\n"
"4:\n"
+ "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
index b2327f3070..2e8f261fe1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ public:
/* Kernel blocking parameters */
static int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<float>() * 3;
}
static int out_height()
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
index bb08fc7cb0..88c984018e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,22 +48,22 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"mov z8.s, #0\n"
"ptrue p0.s\n"
"mov z9.s, #0\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
"mov z10.s, #0\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
"mov z11.s, #0\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
"mov z12.s, #0\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
"mov z13.s, #0\n"
- "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
"mov z14.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
"mov z15.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z16.s, #0\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
"mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -204,37 +204,31 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z31.s, z6.s, z3.s[3]\n"
"ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"fmla z8.s, z4.s, z0.s[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z9.s, z4.s, z0.s[1]\n"
"fmla z10.s, z4.s, z0.s[2]\n"
"fmla z11.s, z4.s, z0.s[3]\n"
"fmla z20.s, z4.s, z1.s[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z21.s, z4.s, z1.s[1]\n"
"fmla z22.s, z4.s, z1.s[2]\n"
"fmla z23.s, z4.s, z1.s[3]\n"
"fmla z12.s, z5.s, z0.s[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z13.s, z5.s, z0.s[1]\n"
"fmla z14.s, z5.s, z0.s[2]\n"
"fmla z15.s, z5.s, z0.s[3]\n"
"fmla z24.s, z5.s, z1.s[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z25.s, z5.s, z1.s[1]\n"
"fmla z26.s, z5.s, z1.s[2]\n"
"fmla z27.s, z5.s, z1.s[3]\n"
"fmla z16.s, z6.s, z0.s[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z17.s, z6.s, z0.s[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z18.s, z6.s, z0.s[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z19.s, z6.s, z0.s[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.s, z6.s, z1.s[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.s, z6.s, z1.s[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.s, z6.s, z1.s[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.s, z6.s, z1.s[3]\n"
"b 4f\n"
"3:\n"
@@ -269,39 +263,39 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z31.s, z6.s, z1.s[3]\n"
"ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"fmla z8.s, z4.s, z2.s[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z9.s, z4.s, z2.s[1]\n"
"fmla z10.s, z4.s, z2.s[2]\n"
"fmla z11.s, z4.s, z2.s[3]\n"
"fmla z20.s, z4.s, z3.s[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z21.s, z4.s, z3.s[1]\n"
"fmla z22.s, z4.s, z3.s[2]\n"
"fmla z23.s, z4.s, z3.s[3]\n"
"fmla z12.s, z5.s, z2.s[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z13.s, z5.s, z2.s[1]\n"
"fmla z14.s, z5.s, z2.s[2]\n"
"fmla z15.s, z5.s, z2.s[3]\n"
"fmla z24.s, z5.s, z3.s[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z25.s, z5.s, z3.s[1]\n"
"fmla z26.s, z5.s, z3.s[2]\n"
"fmla z27.s, z5.s, z3.s[3]\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z17.s, z6.s, z2.s[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z19.s, z6.s, z2.s[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.s, z6.s, z3.s[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.s, z6.s, z3.s[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.s, z6.s, z3.s[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.s, z6.s, z3.s[3]\n"
"4:\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
index 91aa567d4a..67154e6a3f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ public:
/* Kernel blocking parameters */
static int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<int32_t>() * 3;
}
static int out_height()
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
index 2e994a13f3..d679c211ef 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,22 +49,22 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"mov z8.s, #0\n"
"ptrue p0.b\n"
"mov z9.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
"mov z10.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
"mov z11.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
"mov z12.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
"mov z13.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
"mov z14.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
"mov z15.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z16.s, #0\n"
+ "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
"mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -205,37 +205,31 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z31.s, z6.b, z3.b[3]\n"
"ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"sdot z8.s, z4.b, z0.b[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z9.s, z4.b, z0.b[1]\n"
"sdot z10.s, z4.b, z0.b[2]\n"
"sdot z11.s, z4.b, z0.b[3]\n"
"sdot z20.s, z4.b, z1.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z21.s, z4.b, z1.b[1]\n"
"sdot z22.s, z4.b, z1.b[2]\n"
"sdot z23.s, z4.b, z1.b[3]\n"
"sdot z12.s, z5.b, z0.b[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z13.s, z5.b, z0.b[1]\n"
"sdot z14.s, z5.b, z0.b[2]\n"
"sdot z15.s, z5.b, z0.b[3]\n"
"sdot z24.s, z5.b, z1.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z25.s, z5.b, z1.b[1]\n"
"sdot z26.s, z5.b, z1.b[2]\n"
"sdot z27.s, z5.b, z1.b[3]\n"
"sdot z16.s, z6.b, z0.b[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z17.s, z6.b, z0.b[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z18.s, z6.b, z0.b[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"sdot z19.s, z6.b, z0.b[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"sdot z28.s, z6.b, z1.b[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z29.s, z6.b, z1.b[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"sdot z30.s, z6.b, z1.b[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"sdot z31.s, z6.b, z1.b[3]\n"
"b 4f\n"
"3:\n"
@@ -270,39 +264,39 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z31.s, z6.b, z1.b[3]\n"
"ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"sdot z8.s, z4.b, z2.b[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z9.s, z4.b, z2.b[1]\n"
"sdot z10.s, z4.b, z2.b[2]\n"
"sdot z11.s, z4.b, z2.b[3]\n"
"sdot z20.s, z4.b, z3.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z21.s, z4.b, z3.b[1]\n"
"sdot z22.s, z4.b, z3.b[2]\n"
"sdot z23.s, z4.b, z3.b[3]\n"
"sdot z12.s, z5.b, z2.b[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z13.s, z5.b, z2.b[1]\n"
"sdot z14.s, z5.b, z2.b[2]\n"
"sdot z15.s, z5.b, z2.b[3]\n"
"sdot z24.s, z5.b, z3.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z25.s, z5.b, z3.b[1]\n"
"sdot z26.s, z5.b, z3.b[2]\n"
"sdot z27.s, z5.b, z3.b[3]\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z17.s, z6.b, z2.b[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"sdot z19.s, z6.b, z2.b[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"sdot z28.s, z6.b, z3.b[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"sdot z30.s, z6.b, z3.b[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"sdot z31.s, z6.b, z3.b[3]\n"
"4:\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
index ef457e454f..628c5a868e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ public:
/* Kernel blocking parameters */
static int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<uint32_t>() * 3;
}
static int out_height()
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
new file mode 100644
index 0000000000..fcc80d9fe5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_fp32_mla_4VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+class native_fp32_mla_4VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_height()
+ {
+ return 4;
+ }
+
+ static int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static int k_unroll()
+ {
+ return 1;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_fp32_mla_4VLx4;
+
+ native_fp32_mla_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
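
The new header follows the descriptor pattern used by the other strategies in this patch: static blocking parameters (out_height(), out_width(), k_unroll()) plus a kern_type pointer defaulting to the generic implementation. A hedged usage sketch, not the library's actual driver, assuming the header above is on the include path; note the kernel iterates the full M x N range internally (see the generic.cpp below), so a caller mainly uses the blocking parameters for scheduling decisions:

using Strat = arm_gemm::native_fp32_mla_4VLx4;

void run(const float *A, int lda, const float *B, int ldb,
         float *C, int ldc, float beta, int M, int N, int K) {
    Strat strat(nullptr);  // the constructor ignores its CPUInfo argument

    // e.g. how many 4-row blocks a scheduler could split across threads:
    const int row_blocks = (M + Strat::out_height() - 1) / Strat::out_height();
    (void) row_blocks;

    // One call covers the whole problem; tiling happens inside the kernel.
    strat.kernel(A, lda, B, ldb, C, ldc, beta, M, N, K);
}
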
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000000..6e225669fc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
@@ -0,0 +1,2066 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long leftovers = K;
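+    // The main loop consumes 8 elements of K per iteration, the tail after
+    // it consumes 8 (regs path) or 4 more, and the 0-3 leftovers are handled
+    // one at a time under predicate p6. Worked example: K=21 gives
+    // loops_count=2 (16 elements), regs_count=0 (tail of 4) and leftovers=1.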
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
+ const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + x0;
+ long ldbb = ldb * sizeof(float);
+
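+      // Dispatch on the rows remaining in this block: case 1 is the
+      // single-row M tail, and the later cases handle progressively larger
+      // row counts with the same load/FMA structure.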
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
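+ // Two-row variant: the second row's A and C pointers are aliased to X0/X1
+ // and the accumulators double to z16-z23; control flow mirrors case 1.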
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "2:\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
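+ // Three-row variant: extra A pointers in X0/X1, C pointers in X2/X3,
+ // accumulators in z16-z27.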
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z25.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "2:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
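+ // Four-row case (also the default): all 32 SVE registers are live, with
+ // accumulators in z16-z31, A in z0-z7, and B double-buffered in z8-z15.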
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z25.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z26.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z27.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z28.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "2:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
new file mode 100644
index 0000000000..f5634e3618
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
+
+class native_s8s32_dot_4VLx4
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_height()
+ {
+ return 4;
+ }
+
+ static int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static int k_unroll()
+ {
+ return 4;
+ }
+
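+ // Note: out_width() scales with the SVE vector length: at 256 bits a
+ // vector holds 8 int32 values, so the tile is 4 VL = 32 columns wide.
+ // k_unroll() == 4 matches the 4-byte dot-product granularity of sdot.
+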
+ // Default to the generic kernel
+ kern_type kernel=sve_native_s8s32_dot_4VLx4;
+
+ native_s8s32_dot_4VLx4(const CPUInfo *ci)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000000..9c02d95044
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -0,0 +1,4632 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0);
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
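+ // K is split so that K == 32*loops_count + 16*(regs_count+1) + leftovers,
+ // with leftovers < 16; blocks_count then counts whole groups of 4 (one
+ // sdot each) and odds_count the final 1-3 values. For example, K = 70
+ // gives loops_count = 1, regs_count = 1, leftovers = 6, blocks_count = 1,
+ // odds_count = 2 (32 + 32 + 4 + 2 = 70).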
+
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
+ const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ long odds = odds_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + x0;
+ const int8_t *b_ptr1 = b_ptr0 + ldb;
+ const int8_t *b_ptr2 = b_ptr1 + ldb;
+ const int8_t *b_ptr3 = b_ptr2 + ldb;
+ long ldbb = ldb * sizeof(int8_t) * 4;
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "2:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z21.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z22.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
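+ // Three-row variant: adds a_ptr2/c_ptr2 and a third accumulator set (z24-z27).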
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
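+                // Full four-row tile: z16-z31 accumulate a 4-row x 4-vector block
+                // of C, z0-z7 carry the A values, and predicates p0-p3/p6 mask the
+                // N and K tails respectively.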
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
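+                // beta0 is 1 when beta == 0: fall through and start from zeroed
+                // accumulators; otherwise branch to 1: to load C and scale it by beta.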
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z25.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z28.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "9:\n"
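+                // Epilogue: store the accumulator tile back to C, masking the
+                // N tail of each vector with p0-p3.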
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
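
The zip1/zip2 ladder that dominates the kernel above repacks four rows of B,
loaded one vector each, into the block-of-4 layout the SVE dot instructions
consume: after the ladder, every group of four consecutive bytes holds one
output column's values for four consecutive K positions. A minimal scalar
sketch of that packing, assuming a hypothetical helper interleave4 that is
not part of the library:

    #include <cstdint>
    #include <vector>

    // Pack four B rows so that bytes 4*j .. 4*j+3 of the output hold column j
    // of rows k, k+1, k+2, k+3 -- the operand layout sdot/udot expects.
    std::vector<uint8_t> interleave4(const uint8_t *r0, const uint8_t *r1,
                                     const uint8_t *r2, const uint8_t *r3,
                                     int width) {
        std::vector<uint8_t> out(4 * width);
        for (int j = 0; j < width; j++) {
            out[4 * j + 0] = r0[j];
            out[4 * j + 1] = r1[j];
            out[4 * j + 2] = r2[j];
            out[4 * j + 3] = r3[j];
        }
        return out;
    }

With B in this layout, "sdot z16.s, z8.b, z0.b[0]" accumulates into each
32-bit lane of z16 the dot product of one column's four B bytes with four
consecutive A bytes, i.e. a rank-4 update per instruction.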
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
new file mode 100644
index 0000000000..f5ebad8565
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int);
+
+class native_u8u32_dot_4VLx4
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_height()
+ {
+ return 4;
+ }
+
+ static int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+
+ static int k_unroll()
+ {
+ return 4;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_u8u32_dot_4VLx4;
+
+ native_u8u32_dot_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
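
The blocking parameters above are where the 4VLx4 name comes from: each call
produces a tile of out_height() x out_width() = 4 rows by four vector lengths
of uint32_t, and k_unroll() = 4 matches the four K values consumed by each
udot. A quick sanity check, assuming a 256-bit SVE implementation (so
get_vector_length<uint32_t>() == 8; the numbers below are illustrative only):

    const int vl     = 8;        // uint32_t elements per 256-bit vector
    const int tile_w = 4 * vl;   // out_width()  == 32 columns
    const int tile_h = 4;        // out_height() ==  4 rows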
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000000..7d89948dc1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -0,0 +1,4632 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0u);
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
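+    // K is consumed in four stages: each main-loop iteration handles 32
+    // values, "regs" flags one extra 16-wide block beyond the one always
+    // processed after the loop, "blocks" covers the remaining groups of 4
+    // (one dot instruction each) and "odds" the final 1-3 values, so that
+    // 32*loops_count + 16*(regs_count+1) + 4*blocks_count + odds_count == K.
+    // e.g. K=70: loops_count=1, regs_count=1, blocks_count=1, odds_count=2.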
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
+ const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ long odds = odds_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + x0;
+ const uint8_t *b_ptr1 = b_ptr0 + ldb;
+ const uint8_t *b_ptr2 = b_ptr1 + ldb;
+ const uint8_t *b_ptr3 = b_ptr2 + ldb;
+ long ldbb = ldb * sizeof(uint8_t) * 4;
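+            // The kernel walks B four rows at a time through b_ptr0-b_ptr3,
+            // zipping them into the block-of-4 layout udot expects, so the
+            // row stride passed to the asm advances four rows at once (* 4).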
+
+ switch(M-y) {
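+            // Dispatch on the rows remaining in this block: cases 1-3 handle
+            // the edge of M, the default (4) is the full-height tile.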
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "2:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
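+            // Main loop: each iteration consumes 32 K values (two 16-byte
+            // blocks of A) against freshly zipped B columns.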
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
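+ // case 2: two output rows; a_ptr1/c_ptr1 are aliased to X0/X1 via .req, and
+ // row 1 accumulates into z20-z23 alongside row 0 in z16-z19.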
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z21.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z22.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
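+ // Three-row variant: same scheme as above with a third accumulator row
+ // (z24-z27) and extra pointer aliases (a_ptr2/c_ptr2 in X1/X3).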
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
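+ // Four-row variant (also the default): uses the full z16-z31
+ // accumulator tile and pointer aliases in X0-X5.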
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z25.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z28.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
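
The native u8/u32 kernel above works on unpacked B: four b_ptr registers read four consecutive K-rows, two rounds of zip1/zip2 weave their bytes so that each 32-bit lane of a woven register holds B[k+0..k+3][n] for a single column n, and udot then reduces that lane against a broadcast quad of A bytes. The numbered tail paths (blocks/odds) reuse the same weave, substituting zeroed registers for rows past the end of K. Below is a minimal scalar sketch of the arithmetic behind one indexed udot step; all names are illustrative, not from the library:

    #include <cstdint>
    #include <cstddef>

    // Scalar model of "udot zAcc.s, zB.b, zA.b[g]" as used above, assuming
    // b_woven[n] already holds the zip1/zip2 result {B[k+0][n], ..., B[k+3][n]}
    // and a_quad holds the four A bytes A[m][k+0..k+3] selected by index g.
    static void udot_lane_model(uint32_t *acc, const uint8_t (*b_woven)[4],
                                const uint8_t a_quad[4], size_t lanes)
    {
        for (size_t n = 0; n < lanes; n++) {   // one .s lane per output column n
            uint32_t sum = 0;
            for (int j = 0; j < 4; j++) {
                sum += static_cast<uint32_t>(a_quad[j]) *
                       static_cast<uint32_t>(b_woven[n][j]);
            }
            acc[n] += sum;                     // acc[m][n] += 4-deep dot product
        }
    }
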
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
new file mode 100644
index 0000000000..80b216ca14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_smallK_fp32_mla_1VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K);
+
+class smallK_fp32_mla_1VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K);
+
+ /* Kernel blocking parameters */
+ static int out_height()
+ {
+ return 4;
+ }
+
+ static int out_width()
+ {
+ return get_vector_length<float>() * 1;
+ }
+
+ static int k_unroll()
+ {
+ return 1;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_smallK_fp32_mla_1VLx4;
+
+ smallK_fp32_mla_1VLx4(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
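
As with the other kernel descriptors in this commit, the class above only publishes geometry (out_height() = 4 rows, out_width() = one SVE vector of floats, k_unroll() = 1) and a kernel function pointer; it performs no work itself. A hedged sketch of a direct invocation, assuming the header above is included and using only the members it declares (run_smallK itself is illustrative):

    // Illustrative only: run the small-K kernel on row-major A (M x K),
    // B (K x N) and C (M x N). Leading dimensions are passed in elements;
    // the kernel converts them to byte strides internally.
    void run_smallK(const float *A, const float *B, float *C,
                    int M, int N, int K, const CPUInfo *ci)
    {
        arm_gemm::smallK_fp32_mla_1VLx4 strat(ci);
        // beta == 0.0f makes the kernel zero its accumulators instead of
        // seeding them from the existing contents of C.
        strat.kernel(A, /*lda=*/K, B, /*ldb=*/N, C, /*ldc=*/N,
                     /*beta=*/0.0f, M, N, K);
    }
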
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000000..e2cc1d14e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4264 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
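+// Small-K strategy: the switch below is fully unrolled per value of K, so
+// the whole B panel for a block of N columns (one predicated vector load
+// per K-row, starting at z4) is pulled into registers once and reused for
+// every row of A.  Rows are handled four at a time (loops = M / 4), with
+// any leftover rows done singly (oddrows = M % 4).  When beta == 0 the
+// accumulators start from zero; otherwise they are seeded from the current
+// contents of C.  whilelt predicates cover the tails: p0 for the partial
+// vector at the right edge of N, p6 for the partial quadword of A when
+// K % 4 != 0.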
+void sve_smallK_fp32_mla_1VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+
+ const long loops_count = M / 4;
+ const long oddrow_count = M % 4;
+ const long ldab = lda * sizeof(float);
+ const long ldcb = ldc * sizeof(float);
+ const long odd_depth = K % 4;
+ const float *betaptr = &beta;
+ const long ldbb = ldb * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) {
+ const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1));
+ long loops = loops_count;
+ long oddrows = oddrow_count;
+ long temp = 0;
+ const float *b_ptr0 = B + x0;
+
+ const float *a_ptr0 = A;
+
+ float *c_ptr0 = C + x0;
+
+ switch(K) {
+ case 1:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 5:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 6:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 7:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 8:
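+ // Depth-8 variant. As in the cases above, all eight B row vectors are
+ // pre-loaded once (z4-z11) and reused for the whole panel: the main loop
+ // produces four output rows per iteration in accumulators z28-z31, and a
+ // second loop handles any leftover rows singly. With the depth a multiple
+ // of four, every ld1rqw of A uses the all-true predicate p7; p6 (whilelt
+ // against odd_depth) only matters in cases with a partial final quad.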
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 9:
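+ // Depth-9 variant: B rows in z4-z12; the third A quad carries a single
+ // live element, so it is loaded under p6.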
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 10:
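+ // Depth-10 variant: B rows in z4-z13; the third A quad (two live
+ // elements) is loaded under p6.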
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 11:
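+ // Depth-11 variant: B rows in z4-z14; the third A quad (three live
+ // elements) is loaded under p6.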
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 12:
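+ // Depth-12 variant: B rows in z4-z15; the depth is a multiple of four
+ // again, so every A load uses p7.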
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 13:
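+ // Depth-13 variant: B rows in z4-z16; the fourth A quad (one live
+ // element) is loaded under p6.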
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 14:
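+ // Depth-14 variant: B rows in z4-z17; the fourth A quad (two live
+ // elements) is loaded under p6.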
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 15:
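+ // Depth-15 variant: B rows in z4-z18; the fourth A quad (three live
+ // elements) is loaded under p6.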
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 16:
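+ // Depth-16 variant: B rows in z4-z19; four full A quads, so every A load
+ // uses p7.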
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 17:
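+ // Depth-17 variant: B rows in z4-z20; the fifth A quad (one live element)
+ // is loaded under p6.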
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
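+ // Case 18: residual depth of 18. B is preloaded into z4-z21; each main-loop
+ // iteration broadcasts four A rows (z0-z3) against them into accumulators
+ // z28-z31, with the two trailing lanes loaded at #0x40 under predicate p6.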
+ case 18:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
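+ // Case 19: residual depth of 19. B occupies z4-z22; the three trailing
+ // lanes of the fifth A quadword are loaded at #0x40 under predicate p6.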
+ case 19:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
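+ // Case 20: residual depth of 20, i.e. five full quadwords per A row, so
+ // every A load uses the all-true predicate p7 and B fills z4-z23.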
+ case 20:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
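+ // Case 21: residual depth of 21. B fills z4-z24; a sixth quadword load at
+ // #0x50, predicated by p6, supplies the single trailing lane for z24.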
+ case 21:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
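+ // Case 22: residual depth of 22. B fills z4-z25; the p6-predicated #0x50
+ // loads carry the two trailing lanes consumed by z24 and z25.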
+ case 22:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
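+ // Case 23: residual depth of 23. B fills z4-z26; the p6-predicated #0x50
+ // loads carry the three trailing lanes consumed by z24-z26.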
+ case 23:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z26.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
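+ // Case 24 (also the default): a full 24-deep block. All six A quadword
+ // loads use p7 and B occupies the whole of z4-z27.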
+ default:
+ case 24:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z26.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z27.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "fmla z28.s, z27.s, z0.s[3]\n"
+ "fmla z29.s, z27.s, z1.s[3]\n"
+ "fmla z30.s, z27.s, z2.s[3]\n"
+ "fmla z31.s, z27.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "fmla z28.s, z27.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
new file mode 100644
index 0000000000..aa2c522382
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
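+// Kernel descriptor consumed by arm_gemm's kernel selection. The blocking
+// parameters below mean one kernel call produces a tile of out_height() x
+// out_width() results: 4 rows by a single SVE vector of floats, with no
+// extra unrolling in K. A caller sizing its tiling from this descriptor
+// (a sketch, not part of the generated source) would do, e.g.:
+//   const int tile_rows = smallK_hybrid_fp32_mla_1VLx4::out_height(); // 4
+//   const int tile_cols = smallK_hybrid_fp32_mla_1VLx4::out_width();  // VL floats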
+class smallK_hybrid_fp32_mla_1VLx4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_height()
+ {
+ return 4;
+ }
+
+ static int out_width()
+ {
+ return get_vector_length<float>() * 1;
+ }
+
+ static int k_unroll()
+ {
+ return 1;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 1, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx4;
+
+    smallK_hybrid_fp32_mla_1VLx4(const CPUInfo *ci)
+    {
+        // No per-CPU specialisation is needed for this kernel, so ci is
+        // deliberately unused.
+        (void)ci;
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000000..3e7e713106
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4004 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
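+    // Overview of the driver below: M is walked four rows at a time
+    // ('loops' complete blocks plus up to three 'oddrows'), N one vector
+    // width per x0 iteration, and K is fully unrolled by the per-K
+    // specialised assembly in the switch. B appears to be expected in a
+    // pre-packed layout where each out_width()-column block is stored K
+    // vectors deep, which is why b_ptr0 advances by K_stride * x0 and the
+    // assembly addresses B with MUL VL offsets. beta is only tested
+    // against zero (beta0): zero initialises the accumulators, anything
+    // else accumulates onto the existing C values. As a plain-C reference
+    // (a sketch, not the generated code), each output element is:
+    //   C[m][n] = (beta == 0 ? 0 : C[m][n]) + sum over k of A[m][k] * B_packed[k][n]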
+ const long beta0 = (beta == 0.0f);
+
+ const long loops_count = M / 4;
+ const long oddrow_count = M % 4;
+ const long ldab = lda * sizeof(float);
+ const long ldcb = ldc * sizeof(float);
+ const int K_stride = K;
+ const long odd_depth = K % 4;
+ const float *betaptr = &beta;
+
+ for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) {
+ const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1));
+ long loops = loops_count;
+ long oddrows = oddrow_count;
+ long temp = 0;
+ const float *b_ptr0 = B + (K_stride * x0);
+
+ const float *a_ptr0 = A;
+
+ float *c_ptr0 = C + x0;
+
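+    // Each case of the switch fully unrolls its K: the K B-vectors for the
+    // current column block stay resident in z4 onwards (loaded once per x0
+    // iteration, before the row loop), while z28-z31 hold the four C-row
+    // accumulators. The register budget available for B (z4-z27) is what
+    // bounds the K values this smallK kernel serves.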
+ switch(K) {
+ case 1:
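+            // K == 1: one B vector (z4) and a single fmla per output row.
+            // The .req directives alias x0-x5 as the pointers for the three
+            // additional A and C rows processed per block of four.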
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
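+            // The remaining cases repeat the K == 1 scheme with one more B
+            // vector per unit of K (z5, z6, ...) and correspondingly more
+            // fmlas; only the load and multiply-accumulate counts change.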
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 5:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 6:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 7:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 8:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 9:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 10:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 11:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 12:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
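+ // Cases 13 and up repeat one template, each specialised for the number of
+ // accumulation steps given by the case value: the B panel is preloaded into
+ // z4 upwards (here z4-z16), label 2 is the main loop over blocks of four A
+ // rows, labels 3/4 start the accumulators at zero when %[beta0] is set or
+ // reload the existing C block otherwise, and labels 6-8 handle leftover rows
+ // one at a time. Partial trailing A quadwords load under the p6 odd-depth
+ // predicate (one live lane here).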
+ case 13:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
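+ // K-count 14: z4-z17 live; the p6 tail quadword now carries two lanes.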
+ case 14:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
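+ // K-count 15: z4-z18 live; three lanes in the p6 tail quadword.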
+ case 15:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
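+ // K-count 16: z4-z19 live; the depth is a whole number of quadwords, so
+ // every A load uses the full p7 predicate and p6 goes unused in the loop.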
+ case 16:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
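+ // K-count 17: seventeen B vectors; %[b_ptr0] was advanced by 16 vector
+ // lengths, so the 17th (z20) is read back at offset #0. One-lane p6 tail
+ // at A offset #0x40.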
+ case 17:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
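+ // K-count 18: z4-z21 live; two-lane p6 tail at A offset #0x40.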
+ case 18:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
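+ // K-count 19: z4-z22 live; three-lane p6 tail at A offset #0x40.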
+ case 19:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
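+ // K-count 20: z4-z23 live; five full quadwords per A row, no p6 tail.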
+ case 20:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
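+ // K-count 21: z4-z24 live; one-lane p6 tail at A offset #0x50.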
+ case 21:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 22:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 23:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ default:
+ case 24:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z27.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "fmla z28.s, z27.s, z0.s[3]\n"
+ "fmla z29.s, z27.s, z1.s[3]\n"
+ "fmla z30.s, z27.s, z2.s[3]\n"
+ "fmla z31.s, z27.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "fmla z28.s, z27.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
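
For reference, every `case N' in the width switch above implements the same arithmetic with the K depth hard-coded to N so the fmla chain can be fully unrolled: the main loop handles four output rows per iteration, the `oddrows' loop handles the remainder, and the beta0 operand selects between zeroed accumulators and accumulating onto the existing C values. A minimal scalar sketch of that contract follows; the names (gemm_panel_ref, rows, cols, kpanel) are illustrative only and do not appear in this patch:

    // Scalar sketch of one K-panel of the SVE kernel above, under the
    // assumptions stated in the lead-in. beta0 mirrors the 'cbz %[beta0]'
    // split: when true, start from zeroed accumulators instead of loading C.
    static void gemm_panel_ref(const float *a, int lda,   // A: rows x kpanel
                               const float *b, int ldb,   // B: kpanel x cols
                               float *c, int ldc,
                               int rows, int cols, int kpanel, bool beta0)
    {
        for (int r = 0; r < rows; r++) {
            for (int col = 0; col < cols; col++) {
                float acc = beta0 ? 0.0f : c[r * ldc + col];
                for (int k = 0; k < kpanel; k++) {
                    acc += a[r * lda + k] * b[k * ldb + col];
                }
                c[r * ldc + col] = acc;
            }
        }
    }

In the generated code the column loop is vectorized instead of written out: p0 predicates the partial vector of output columns (whilelt on `width'), p6 predicates the final partial quadword of A (whilelt on `odd_depth'), and p7 is all-true for the full-width loads.
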
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
new file mode 100644
index 0000000000..fcdca59bdd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
@@ -0,0 +1,1660 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template<>
+inline void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
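+ /* 'in' points at packed 12x8 result blocks: within a block, row r starts at
+ * inptr[12*r] and each block is 12*8 = 96 floats long. The routine computes
+ * out = alpha*in + beta*out for up to 8 rows ('height') and 12 columns per
+ * block, falling back to scalar code at the ragged right-hand edge. */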
+ const float *inptr = in;
+
+ for (int y=y0; y<ymax; y+=8) {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+ float *outptr6 = outptr5 + ldout;
+ float *outptr7 = outptr6 + ldout;
+
+ const int height = ymax - y;
+
+ for (int i=x0; i<xmax; i+=12) {
+ if (beta==0.0f)
+ {
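+ /* beta == 0 takes a dedicated path that never reads 'out', so the
+ * destination may be uninitialized without polluting the result. */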
+ switch(height) {
+ case 1:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr0], #0x10]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 2:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x10]\n"
+ "ldr q7, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x10]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 3:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr0], #0x10]\n"
+ "ldr q4, [%[inptr], #0x40]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr1], #0x10]\n"
+ "ldr q5, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr2], #0x10]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 4:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x10]\n"
+ "ldr q5, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x10]\n"
+ "ldr q6, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x10]\n"
+ "ldr q7, [%[inptr], #0xa0]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x10]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 5:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]);
+ outptr4++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q5, [%[inptr], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr0], #0x10]\n"
+ "ldr q6, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr1], #0x10]\n"
+ "ldr q7, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr2], #0x10]\n"
+ "ldr q4, [%[inptr], #0xa0]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr3], #0x10]\n"
+ "ldr q5, [%[inptr], #0xd0]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr4], #0x10]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "ldr q5, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x20]\n"
+ "ldr q6, [%[inptr], #0xe0]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 6:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]);
+ outptr5++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q6, [%[inptr], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x10]\n"
+ "ldr q7, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x10]\n"
+ "ldr q4, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x10]\n"
+ "ldr q5, [%[inptr], #0xa0]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x10]\n"
+ "ldr q6, [%[inptr], #0xd0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x10]\n"
+ "ldr q7, [%[inptr], #0x100]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr5], #0x10]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "ldr q4, [%[inptr], #0xe0]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x20]\n"
+ "ldr q5, [%[inptr], #0x110]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x20]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 7:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]);
+ outptr6++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q6, [%[inptr], #0x120]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6]]\n"
+ "ldr q7, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr0], #0x10]\n"
+ "ldr q4, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr1], #0x10]\n"
+ "ldr q5, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr2], #0x10]\n"
+ "ldr q6, [%[inptr], #0xa0]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr3], #0x10]\n"
+ "ldr q7, [%[inptr], #0xd0]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr4], #0x10]\n"
+ "ldr q4, [%[inptr], #0x100]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr5], #0x10]\n"
+ "ldr q5, [%[inptr], #0x130]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr6], #0x10]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "ldr q5, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x20]\n"
+ "ldr q6, [%[inptr], #0xe0]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x20]\n"
+ "ldr q7, [%[inptr], #0x110]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr5], #0x20]\n"
+ "ldr q4, [%[inptr], #0x140]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr6], #0x20]\n"
+ "add %[outptr6], %[outptr6], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ default:
+ case 8:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile (
+ "ldr q4, [%[inptr]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q6, [%[inptr], #0x120]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6]]\n"
+ "ldr q7, [%[inptr], #0x150]\n"
+ "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7]]\n"
+ "ldr q4, [%[inptr], #0x10]\n"
+ "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x10]\n"
+ "ldr q5, [%[inptr], #0x40]\n"
+ "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x10]\n"
+ "ldr q6, [%[inptr], #0x70]\n"
+ "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x10]\n"
+ "ldr q7, [%[inptr], #0xa0]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x10]\n"
+ "ldr q4, [%[inptr], #0xd0]\n"
+ "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x10]\n"
+ "ldr q5, [%[inptr], #0x100]\n"
+ "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x10]\n"
+ "ldr q6, [%[inptr], #0x130]\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6], #0x10]\n"
+ "ldr q7, [%[inptr], #0x160]\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7], #0x10]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "ldr q4, [%[inptr], #0xe0]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x20]\n"
+ "ldr q5, [%[inptr], #0x110]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x20]\n"
+ "ldr q6, [%[inptr], #0x140]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6], #0x20]\n"
+ "ldr q7, [%[inptr], #0x170]\n"
+ "add %[outptr6], %[outptr6], #0x30\n"
+ "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7], #0x20]\n"
+ "add %[outptr7], %[outptr7], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ }
+ }
+ else
+ {
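+ /* General path: out = alpha*in + beta*out, which loads the existing
+ * output values before accumulating. */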
+ switch(height) {
+ case 1:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ }
+ }
+ inptr += 96;
+ } else {
+ /* Optimized routine to merge an entire block (alpha*in + beta*out) */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x10]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr0], #0x10]\n"
+ "ldr q10, [%[outptr0], #0x20]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 2:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ }
+ }
+ inptr += 96;
+ } else {
+            /* Optimized routine to merge an entire block: C = alpha * (A*B) + beta * C */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x10]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x10]\n"
+ "ldr q11, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x40]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x10]\n"
+ "ldr q8, [%[outptr0], #0x20]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q9, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 3:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ }
+ }
+ inptr += 96;
+ } else {
+            /* Optimized routine to merge an entire block: C = alpha * (A*B) + beta * C */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x10]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr0], #0x10]\n"
+ "ldr q8, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x40]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr1], #0x10]\n"
+ "ldr q9, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x70]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr2], #0x10]\n"
+ "ldr q10, [%[outptr0], #0x20]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q11, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q8, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 4:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ }
+ }
+ inptr += 96;
+ } else {
+            /* Optimized routine to merge an entire block: C = alpha * (A*B) + beta * C */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x10]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x10]\n"
+ "ldr q9, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x40]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x10]\n"
+ "ldr q10, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x70]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x10]\n"
+ "ldr q11, [%[outptr3], #0x10]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xa0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x10]\n"
+ "ldr q8, [%[outptr0], #0x20]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q9, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q10, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q11, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 5:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ }
+ }
+ inptr += 96;
+ } else {
+            /* Optimized routine to merge an entire block: C = alpha * (A*B) + beta * C */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q9, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x10]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr0], #0x10]\n"
+ "ldr q10, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x40]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr1], #0x10]\n"
+ "ldr q11, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x70]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr2], #0x10]\n"
+ "ldr q8, [%[outptr3], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xa0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr3], #0x10]\n"
+ "ldr q9, [%[outptr4], #0x10]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xd0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr4], #0x10]\n"
+ "ldr q10, [%[outptr0], #0x20]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q11, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q8, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "ldr q9, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xb0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x20]\n"
+ "ldr q10, [%[outptr4], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0xe0]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 6:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ }
+ }
+ inptr += 96;
+ } else {
+            /* Optimized routine to merge an entire block: C = alpha * (A*B) + beta * C */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q9, [%[outptr5]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q10, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x10]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x10]\n"
+ "ldr q11, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x40]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x10]\n"
+ "ldr q8, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x70]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x10]\n"
+ "ldr q9, [%[outptr3], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xa0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x10]\n"
+ "ldr q10, [%[outptr4], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0xd0]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x10]\n"
+ "ldr q11, [%[outptr5], #0x10]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x100]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr5], #0x10]\n"
+ "ldr q8, [%[outptr0], #0x20]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q9, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q10, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q11, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "ldr q8, [%[outptr4], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xe0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x20]\n"
+ "ldr q9, [%[outptr5], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x110]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x20]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ case 7:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ }
+ }
+ inptr += 96;
+ } else {
+            /* Optimized routine to merge an entire block: C = alpha * (A*B) + beta * C */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q9, [%[outptr5]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q10, [%[outptr6]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x120]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6]]\n"
+ "ldr q11, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x10]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr0], #0x10]\n"
+ "ldr q8, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x40]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr1], #0x10]\n"
+ "ldr q9, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x70]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr2], #0x10]\n"
+ "ldr q10, [%[outptr3], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0xa0]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr3], #0x10]\n"
+ "ldr q11, [%[outptr4], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xd0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr4], #0x10]\n"
+ "ldr q8, [%[outptr5], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x100]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr5], #0x10]\n"
+ "ldr q9, [%[outptr6], #0x10]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x130]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr6], #0x10]\n"
+ "ldr q10, [%[outptr0], #0x20]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x20]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr0], #0x20]\n"
+ "ldr q11, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x50]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr1], #0x20]\n"
+ "ldr q8, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x80]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr2], #0x20]\n"
+ "ldr q9, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xb0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr3], #0x20]\n"
+ "ldr q10, [%[outptr4], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0xe0]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr4], #0x20]\n"
+ "ldr q11, [%[outptr5], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x110]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr5], #0x20]\n"
+ "ldr q8, [%[outptr6], #0x20]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x140]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr6], #0x20]\n"
+ "add %[outptr6], %[outptr6], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+ default:
+ case 8:
+ {
+ if ((i+11) >= xmax)
+ {
+ for (int xi=0; xi<12; xi++)
+ {
+ if ((i+xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ } else {
+            /* Optimized routine to merge an entire block: C = alpha * (A*B) + beta * C */
+ __asm __volatile (
+ "ldr q8, [%[outptr0]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr]]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0]]\n"
+ "ldr q9, [%[outptr1]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x30]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1]]\n"
+ "ldr q10, [%[outptr2]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x60]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2]]\n"
+ "ldr q11, [%[outptr3]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x90]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3]]\n"
+ "ldr q8, [%[outptr4]]\n"
+ "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xc0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4]]\n"
+ "ldr q9, [%[outptr5]]\n"
+ "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0xf0]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5]]\n"
+ "ldr q10, [%[outptr6]]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x120]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6]]\n"
+ "ldr q11, [%[outptr7]]\n"
+ "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x150]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7]]\n"
+ "ldr q8, [%[outptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x10]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x10]\n"
+ "ldr q9, [%[outptr1], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x40]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x10]\n"
+ "ldr q10, [%[outptr2], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x70]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x10]\n"
+ "ldr q11, [%[outptr3], #0x10]\n"
+ "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xa0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x10]\n"
+ "ldr q8, [%[outptr4], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xd0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x10]\n"
+ "ldr q9, [%[outptr5], #0x10]\n"
+ "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x100]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x10]\n"
+ "ldr q10, [%[outptr6], #0x10]\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x130]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6], #0x10]\n"
+ "ldr q11, [%[outptr7], #0x10]\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x160]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7], #0x10]\n"
+ "ldr q8, [%[outptr0], #0x20]\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0x20]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr0], #0x20]\n"
+ "ldr q9, [%[outptr1], #0x20]\n"
+ "add %[outptr0], %[outptr0], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x50]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr1], #0x20]\n"
+ "ldr q10, [%[outptr2], #0x20]\n"
+ "add %[outptr1], %[outptr1], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x80]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr2], #0x20]\n"
+ "ldr q11, [%[outptr3], #0x20]\n"
+ "add %[outptr2], %[outptr2], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0xb0]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr3], #0x20]\n"
+ "ldr q8, [%[outptr4], #0x20]\n"
+ "add %[outptr3], %[outptr3], #0x30\n"
+ "fmul v8.4s, v8.4s, %[beta].s[0]\n"
+ "ldr q4, [%[inptr], #0xe0]\n"
+ "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
+ "str q8, [%[outptr4], #0x20]\n"
+ "ldr q9, [%[outptr5], #0x20]\n"
+ "add %[outptr4], %[outptr4], #0x30\n"
+ "fmul v9.4s, v9.4s, %[beta].s[0]\n"
+ "ldr q5, [%[inptr], #0x110]\n"
+ "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
+ "str q9, [%[outptr5], #0x20]\n"
+ "ldr q10, [%[outptr6], #0x20]\n"
+ "add %[outptr5], %[outptr5], #0x30\n"
+ "fmul v10.4s, v10.4s, %[beta].s[0]\n"
+ "ldr q6, [%[inptr], #0x140]\n"
+ "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
+ "str q10, [%[outptr6], #0x20]\n"
+ "ldr q11, [%[outptr7], #0x20]\n"
+ "add %[outptr6], %[outptr6], #0x30\n"
+ "fmul v11.4s, v11.4s, %[beta].s[0]\n"
+ "ldr q7, [%[inptr], #0x170]\n"
+ "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
+ "str q11, [%[outptr7], #0x20]\n"
+ "add %[outptr7], %[outptr7], #0x30\n"
+ "add %[inptr], %[inptr], #0x180\n"
+ : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+ [inptr] "+r" (inptr)
+ : [alpha] "w" (alpha), [beta] "w" (beta)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
+ );
+ }
+ }
+ break;
+
+
+ }
+ }
+ }
+ }
+}
+
+#endif // __aarch64__
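The routine above is a generated merge/writeback: each accumulated tile is stored as C = alpha * (A*B) + beta * C, with the inline-asm paths covering full 12-column tiles for heights 1 through 8 and the scalar loop covering the partial tile at the right edge ((i + 11) >= xmax). A minimal C++ sketch of the same semantics (merge_tile_ref, the ldout stride and the hard-coded 8x12 tile shape are illustrative, not the library's interface):

    #include <algorithm>

    // Scalar reference for the generated merge: up to 8 rows x 12 columns per
    // tile; the packed input is row-major within the tile (stride 12, so the
    // input pointer advances by 96 floats = 0x180 bytes per tile, as above).
    static void merge_tile_ref(float *out, int ldout, const float *in,
                               int height, int i, int xmax,
                               float alpha, float beta)
    {
        const int width = std::min(12, xmax - i);   // valid columns in this tile
        for (int r = 0; r < std::min(height, 8); r++) {
            for (int c = 0; c < width; c++) {
                float *p = out + r * ldout + c;
                *p = (alpha * in[r * 12 + c]) + (*p * beta);
            }
        }
    }

The beta == 0 branch earlier in the file never loads from the output pointers at all (fmul only, no ldr/fmla on C), which is why its asm is roughly half the length of the general branch shown here.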
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index e422b91c83..0330783a0b 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,11 +81,14 @@ struct TransformImpl {
}
}
// "row" tail - row is out of range so fill with zeros always.
- for (int row = 0; row < blank_rows; row++) {
- for (int col=0; col < (fill_cols + blank_cols); col++) {
- *out++ = static_cast<TOut>(0);
- }
+ TOut zeroval = static_cast<TOut>(0);
+ int pads = blank_rows * (fill_cols + blank_cols);
+
+ for (int i=0; i<pads; i++) {
+ out[i] = zeroval;
}
+
+ out += pads;
}
}
}
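The hunk above collapses the nested row/column zero-fill of the out-of-range row tail into one flat loop over pads = blank_rows * (fill_cols + blank_cols) elements; the padded region is contiguous in the packed output, so the two forms store exactly the same bytes and the flat loop is friendlier to auto-vectorization. An equivalent standard-library formulation, as a sketch rather than what the patch uses:

    #include <algorithm>
    #include <cstddef>

    // Flat zero-fill of the packed row tail; std::fill_n returns out + pads,
    // matching the "out += pads" in the patched code.
    template <typename TOut>
    TOut *fill_row_tail(TOut *out, int blank_rows, int fill_cols, int blank_cols)
    {
        const std::ptrdiff_t pads =
            static_cast<std::ptrdiff_t>(blank_rows) * (fill_cols + blank_cols);
        return std::fill_n(out, pads, static_cast<TOut>(0));
    }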
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 347eafb56a..0648ff6335 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#pragma once
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
#include <arm_neon.h>
@@ -173,4 +173,4 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
}
}
-#endif // __aarch64__
+#endif // __aarch64__ && !__ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index fc1f2c24f4..e1ebba077b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,17 +23,14 @@
*/
#include "a32_interleave_6way_32bit.hpp"
#include "a32_transpose_interleave_8way_32bit.hpp"
-#ifdef __ARM_FEATURE_SVE
-#include "sve_interleave_8way_32bit.hpp"
-#include "sve_interleave_8way_block2_32bit.hpp"
-#include "sve_interleave_8way_block4_8bit.hpp"
-#else
-#include "a64_interleave_8way_32bit.hpp"
-#endif
#include "a64_block16_interleave4_8bit.hpp"
#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
#include "a64_interleave_8way_half_to_float.hpp"
#include "a64_transpose_interleave_12way_16bit.hpp"
#include "a64_transpose_interleave_12way_half_to_float.hpp"
#include "a64_transpose_interleave_24way_16bit.hpp"
-#include "transpose_interleave_common.hpp"
+#include "sve_interleave_8way_32bit.hpp"
+#include "sve_interleave_8way_block2_32bit.hpp"
+#include "sve_interleave_8way_block4_8bit.hpp"
+#include "transpose_interleave_common.hpp" \ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
index 752e837f8d..07c8219c1b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
long outpos = 0;
uint32_t *outptr = master_outptr;
- master_outptr += outwidth;
+ master_outptr += (outwidth * 1);
const uint32_t *inptr0 = inptr + y * ldin + k0;
const uint32_t *inptr1 = inptr0 + ldin;
@@ -60,52 +60,53 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"zip1 z8.s, z0.s, z4.s\n"
+ "incw %[inpos], all, mul #1\n"
"zip2 z9.s, z0.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip1 z0.s, z8.s, z4.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z4.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z4.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.s, z1.s, z4.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z11.s, z1.s, z4.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip1 z12.s, z2.s, z4.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z4.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip2 z13.s, z2.s, z4.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z14.s, z3.s, z4.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
"zip2 z15.s, z3.s, z4.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
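The rewritten height-1 case predicates the load with whilelt on inpos/inwidth, expands the single input row into the 8-way interleaved layout with three rounds of zip against a zero vector, and predicates each of the eight stores on outpos/outwidth, so the ragged tail needs no separate path. Roughly the same loop in ACLE intrinsics, as a sketch with hypothetical names (the library emits the asm directly rather than using intrinsics):

    #include <arm_sve.h>
    #include <stdint.h>

    // Store one vector with a just-in-time predicate, advancing the output
    // position; models the "whilelt pN.s / incw / st1w" triplets above.
    static inline uint32_t *store_next(uint32_t *out, int64_t *outpos,
                                       int64_t outwidth, svuint32_t v)
    {
        svbool_t p = svwhilelt_b32_s64(*outpos, outwidth);
        *outpos += (int64_t)svcntw();
        svst1_u32(p, out, v);
        return out + svcntw();
    }

    // Height-1 8-way interleave: one real row, seven rows of zero padding.
    static void interleave8_h1(uint32_t *out, const uint32_t *in,
                               int64_t inwidth, int64_t outwidth)
    {
        int64_t inpos = 0, outpos = 0;
        const svuint32_t zero = svdup_n_u32(0);
        for (;;) {
            svbool_t pin = svwhilelt_b32_s64(inpos, inwidth);
            if (!svptest_any(svptrue_b32(), pin)) break;   // "b.none 2f"
            svuint32_t z0 = svld1_u32(pin, in + inpos);
            inpos += (int64_t)svcntw();                    // "incw %[inpos]"
            // Three zip rounds against zero: 1 row -> 8 interleaved rows.
            svuint32_t a0 = svzip1_u32(z0, zero), a1 = svzip2_u32(z0, zero);
            svuint32_t b0 = svzip1_u32(a0, zero), b1 = svzip2_u32(a0, zero);
            svuint32_t b2 = svzip1_u32(a1, zero), b3 = svzip2_u32(a1, zero);
            out = store_next(out, &outpos, outwidth, svzip1_u32(b0, zero));
            out = store_next(out, &outpos, outwidth, svzip2_u32(b0, zero));
            out = store_next(out, &outpos, outwidth, svzip1_u32(b1, zero));
            out = store_next(out, &outpos, outwidth, svzip2_u32(b1, zero));
            out = store_next(out, &outpos, outwidth, svzip1_u32(b2, zero));
            out = store_next(out, &outpos, outwidth, svzip2_u32(b2, zero));
            out = store_next(out, &outpos, outwidth, svzip1_u32(b3, zero));
            out = store_next(out, &outpos, outwidth, svzip2_u32(b3, zero));
        }
    }

Generating each store predicate just in time is also why the regenerated asm keeps only p0-p3 live (and interleaves stores with the zips), where the old scheduling materialized p0-p7 up front.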
@@ -115,60 +116,62 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "mov z14.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"zip1 z8.s, z0.s, z4.s\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "incw %[inpos], all, mul #1\n"
"zip1 z10.s, z1.s, z4.s\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip2 z11.s, z1.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip1 z0.s, z8.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z4.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z4.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z4.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
+ "mov z14.s, #0\n"
"whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
+ "zip1 z4.s, z10.s, z14.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip2 z5.s, z10.s, z14.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z6.s, z11.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z7.s, z11.s, z14.s\n"
"zip1 z8.s, z0.s, z4.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z9.s, z0.s, z4.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.s, z1.s, z5.s\n"
- "incw %[outpos], all, mul #1\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z11.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z13.s, z2.s, z6.s\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"zip1 z14.s, z3.s, z7.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -178,63 +181,66 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "mov z14.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
"zip1 z10.s, z1.s, z4.s\n"
+ "incw %[inpos], all, mul #1\n"
"zip2 z11.s, z1.s, z4.s\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip1 z12.s, z2.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip2 z13.s, z2.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z4.s, z10.s, z14.s\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
+ "mov z14.s, #0\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "zip1 z4.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z6.s, z11.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z7.s, z11.s, z14.s\n"
"zip1 z8.s, z0.s, z4.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z11.s, z1.s, z5.s\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.s, z2.s, z6.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z13.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"zip1 z14.s, z3.s, z7.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
"incw %[outpos], all, mul #1\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
+ "zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
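A second recurring change in these hunks is the addressing mode: the old code indexed every load off the shared counter ("[%[inptrN], %[inpos], LSL #2]"), while the regenerated code loads from the bare pointer and then bumps each pointer by one vector length ("addvl %[inptrN], %[inptrN], #1"), presumably to shorten the address-generation dependency on inpos. The intrinsics-level equivalent of the new style, again as a sketch:

    #include <arm_sve.h>
    #include <stdint.h>

    // "ld1w z, p/z, [ptr]" followed by "addvl ptr, ptr, #1": load one vector,
    // then advance by svcntw() 32-bit lanes (one vector register's worth).
    static inline svuint32_t load_and_bump(svbool_t pg, const uint32_t **ptr)
    {
        svuint32_t v = svld1_u32(pg, *ptr);
        *ptr += svcntw();
        return v;
    }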
@@ -244,65 +250,69 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
"zip1 z10.s, z1.s, z4.s\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
"zip2 z11.s, z1.s, z4.s\n"
+ "incw %[inpos], all, mul #1\n"
"zip1 z12.s, z2.s, z4.s\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip2 z13.s, z2.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip1 z14.s, z3.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip2 z15.s, z3.s, z4.s\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z11.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"zip1 z14.s, z3.s, z7.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -312,66 +322,71 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z5.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"incw %[inpos], all, mul #1\n"
"zip1 z10.s, z1.s, z5.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
"zip2 z11.s, z1.s, z5.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "zip1 z12.s, z2.s, z5.s\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
"zip1 z8.s, z0.s, z4.s\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip2 z9.s, z0.s, z4.s\n"
- "zip1 z12.s, z2.s, z5.s\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip2 z13.s, z2.s, z5.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip1 z14.s, z3.s, z5.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip2 z15.s, z3.s, z5.s\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z11.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"zip1 z14.s, z3.s, z7.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -381,67 +396,73 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z6.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"incw %[inpos], all, mul #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "zip2 z13.s, z2.s, z6.s\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip2 z11.s, z1.s, z5.s\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z14.s, z3.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
"zip2 z15.s, z3.s, z6.s\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z11.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"zip1 z14.s, z3.s, z7.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -451,68 +472,75 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
"mov z7.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
"incw %[inpos], all, mul #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
"zip1 z14.s, z3.s, z7.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "ld1w z6.s, p0/z, [%[inptr6]]\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
"zip2 z11.s, z1.s, z5.s\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
"zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
"zip2 z15.s, z3.s, z7.s\n"
+ "addvl %[inptr6], %[inptr6], #1\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
"whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
+ "zip2 z7.s, z11.s, z15.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip1 z8.s, z0.s, z4.s\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z11.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"zip1 z14.s, z3.s, z7.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
@@ -522,69 +550,77 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *
"1:\n"
"whilelt p0.s, %[inpos], %[inwidth]\n"
"b.none 2f\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
- "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
+ "ld1w z0.s, p0/z, [%[inptr0]]\n"
"incw %[inpos], all, mul #1\n"
+ "ld1w z1.s, p0/z, [%[inptr1]]\n"
+ "addvl %[inptr0], %[inptr0], #1\n"
+ "ld1w z2.s, p0/z, [%[inptr2]]\n"
+ "addvl %[inptr1], %[inptr1], #1\n"
+ "ld1w z3.s, p0/z, [%[inptr3]]\n"
+ "addvl %[inptr2], %[inptr2], #1\n"
+ "ld1w z4.s, p0/z, [%[inptr4]]\n"
+ "addvl %[inptr3], %[inptr3], #1\n"
"zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
+ "ld1w z5.s, p0/z, [%[inptr5]]\n"
"zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "ld1w z6.s, p0/z, [%[inptr6]]\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "ld1w z7.s, p0/z, [%[inptr7]]\n"
"zip2 z11.s, z1.s, z5.s\n"
+ "addvl %[inptr4], %[inptr4], #1\n"
"zip1 z12.s, z2.s, z6.s\n"
+ "addvl %[inptr5], %[inptr5], #1\n"
"zip2 z13.s, z2.s, z6.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "addvl %[inptr6], %[inptr6], #1\n"
"zip1 z14.s, z3.s, z7.s\n"
- "incw %[outpos], all, mul #1\n"
+ "addvl %[inptr7], %[inptr7], #1\n"
"zip2 z15.s, z3.s, z7.s\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip1 z0.s, z8.s, z12.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z1.s, z8.s, z12.s\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
"zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z3.s, z9.s, z13.s\n"
"incw %[outpos], all, mul #1\n"
+ "zip2 z3.s, z9.s, z13.s\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
"zip1 z4.s, z10.s, z14.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z5.s, z10.s, z14.s\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z6.s, z11.s, z15.s\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z7.s, z11.s, z15.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
"zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
+ "st1w z8.s, p0, [%[outptr]]\n"
"zip2 z9.s, z0.s, z4.s\n"
+ "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
"zip1 z10.s, z1.s, z5.s\n"
+ "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
"zip2 z11.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
+ "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
"zip1 z12.s, z2.s, z6.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
+ "whilelt p0.s, %[outpos], %[outwidth]\n"
"zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
+ "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
"zip1 z14.s, z3.s, z7.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+ "incw %[outpos], all, mul #1\n"
"zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+ "whilelt p1.s, %[outpos], %[outwidth]\n"
+ "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
+ "whilelt p2.s, %[outpos], %[outwidth]\n"
+ "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
+ "whilelt p3.s, %[outpos], %[outwidth]\n"
+ "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
"incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
"addvl %[outptr], %[outptr], #8\n"
"b 1b\n"
"2:\n"
: [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
: [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
);
break;
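
Note on the regenerated SVE transforms above: the rewritten assembly cycles through just four predicate registers (p0-p3), recomputing each with "whilelt" once the store that used it has issued, and steps every input pointer forward by one vector with "addvl #1" instead of the indexed "[base, pos, LSL #2]" addressing. The sketch below restates the predicated load/store idea with ACLE SVE intrinsics; it is illustrative only (the function name and shape are assumptions, not part of the patch):

    #ifdef __ARM_FEATURE_SVE
    #include <arm_sve.h>
    #include <cstdint>

    // Copy 'width' floats with per-chunk predicates, as the transforms do:
    // whilelt builds a predicate that masks off lanes past 'width', so the
    // tail needs no scalar epilogue.
    void predicated_copy(const float *in, float *out, uint64_t width)
    {
        for(uint64_t pos = 0; pos < width; pos += svcntw())
        {
            svbool_t    p = svwhilelt_b32(pos, width); // active lanes only
            svfloat32_t v = svld1_f32(p, in + pos);    // masked load
            svst1_f32(p, out + pos, v);                // masked store
        }
    }
    #endif
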
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index a1fc00ea89..8b96c328a6 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,7 @@
#pragma once
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif
+#include <cstddef>
// Macro for unreachable code (e.g. impossible default cases on switch)
#define UNREACHABLE(why) __builtin_unreachable()
@@ -49,13 +47,43 @@ inline T roundup(const T a, const T b) {
}
}
+namespace arm_gemm {
+namespace utils {
+namespace {
+
+#ifdef __ARM_FEATURE_SVE
+template<size_t sz>
+inline unsigned long get_vector_length_sz() {
+ unsigned long v;
+
+ __asm (
+ "cntb %0"
+ : "=r" (v)
+ );
+
+ return v / sz;
+}
+
+#define VEC_LEN_SPEC(sz, opcode) template <> inline unsigned long get_vector_length_sz<sz>() { unsigned long v; __asm ( opcode " %0" : "=r" (v)); return v; }
+
+VEC_LEN_SPEC(8, "cntd")
+VEC_LEN_SPEC(4, "cntw")
+VEC_LEN_SPEC(2, "cnth")
+VEC_LEN_SPEC(1, "cntb")
+#endif
+
+} // anonymous namespace
+
template <typename T>
inline unsigned long get_vector_length() {
#ifdef __ARM_FEATURE_SVE
- const unsigned long length = svcntb();
+ return get_vector_length_sz<sizeof(T)>();
#else
- const unsigned long length = 16;
+ return 16 / sizeof(T);
#endif
+}
+
+} // utils namespace
+} // arm_gemm namespace
- return length / sizeof(T);
-}
\ No newline at end of file
+using namespace arm_gemm::utils;
\ No newline at end of file
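
The rewritten utils.hpp drops the <arm_sve.h> dependency: the vector length is read directly with the CNTB/CNTH/CNTW/CNTD instructions, selected by element size through get_vector_length_sz<>, and non-SVE builds fall back to the fixed 16-byte NEON width. Since the header ends with a using-directive, callers can use the helper unqualified. A hedged usage sketch (the printout is illustrative):

    #include <cstdint>
    #include <cstdio>
    #include "utils.hpp" // src/core/NEON/kernels/arm_gemm/utils.hpp

    int main()
    {
        // Elements per vector register: 16 / sizeof(T) on NEON,
        // the CNTW/CNTB results on SVE (vector-length agnostic).
        std::printf("float lanes: %lu\n", get_vector_length<float>());
        std::printf("int8  lanes: %lu\n", get_vector_length<int8_t>());
        return 0;
    }
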
diff --git a/src/core/NEON/kernels/assembly/Helpers.cpp b/src/core/NEON/kernels/assembly/Helpers.cpp
index 09ac08c0a4..3d8d66d7fc 100644
--- a/src/core/NEON/kernels/assembly/Helpers.cpp
+++ b/src/core/NEON/kernels/assembly/Helpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,91 +24,47 @@
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "NEGEMMInterleavedStrategies.h"
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
namespace arm_compute
{
-namespace
-{
-template <typename InputType, bool use_dot = false>
-BlockSizes calculate_block_sizes_template(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
-{
- using strategy = typename Kernel<InputType, use_dot>::strategy;
- return calculate_block_sizes<strategy>(ci, M, N, K);
-}
-} // namespace
-
-const char *get_strategy_name(DataType input_type, bool use_dot)
+arm_gemm::KernelDescription get_gemm_info(DataType input_type,
+ const CPUInfo &ci,
+ const unsigned int num_threads,
+ const INEGEMMWrapperKernel::Params &p,
+ float alpha,
+ float beta,
+ bool pretranspose_hint)
{
switch(input_type)
{
- case DataType::F32:
- return Kernel<float>::name;
#ifdef __aarch64__
- case DataType::U8:
case DataType::QASYMM8:
- if(use_dot)
- {
- return Kernel<uint8_t, true>::name;
- }
- else
- {
- return Kernel<uint8_t, false>::name;
- }
- case DataType::S8:
- if(use_dot)
- {
- return Kernel<int8_t, true>::name;
- }
- else
- {
- return Kernel<int8_t, false>::name;
- }
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Kernel<__fp16>::name;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
-}
-
-BlockSizes calculate_block_sizes_from_data_type(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K, DataType input_type, bool use_dot)
-{
- switch(input_type)
- {
- case DataType::F32:
- return calculate_block_sizes_template<float>(ci, M, N, K);
-#ifdef __aarch64__
case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- return calculate_block_sizes_template<uint8_t, true>(ci, M, N, K);
- }
- else
- {
- return calculate_block_sizes_template<uint8_t, false>(ci, M, N, K);
- }
+ {
+ arm_gemm::GemmArgs<uint32_t> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+ return arm_gemm::get_gemm_method<uint8_t, uint32_t>(args);
+ }
case DataType::S8:
- if(use_dot)
- {
- return calculate_block_sizes_template<int8_t, true>(ci, M, N, K);
- }
- else
- {
- return calculate_block_sizes_template<int8_t, false>(ci, M, N, K);
- }
-#endif /* __aarch64__ */
+ {
+ arm_gemm::GemmArgs<int32_t> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+ return arm_gemm::get_gemm_method<int8_t, int32_t>(args);
+ }
+#endif // __aarch64__
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return calculate_block_sizes_template<__fp16>(ci, M, N, K);
+ {
+ arm_gemm::GemmArgs<__fp16> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+ return arm_gemm::get_gemm_method<__fp16, __fp16>(args);
+ }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ case DataType::F32:
+ {
+ arm_gemm::GemmArgs<float> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+ return arm_gemm::get_gemm_method<float, float>(args);
+ }
default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
+ return arm_gemm::KernelDescription();
}
}
} // namespace arm_compute
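
Helpers.cpp now delegates kernel selection entirely to arm_gemm: the problem description is packed into a GemmArgs and the KernelDescription chosen by arm_gemm::get_gemm_method() is returned as-is (or a default-constructed one for unsupported types). A hedged caller sketch, with made-up problem sizes:

    #include "arm_compute/core/NEON/kernels/assembly/Helpers.h"

    arm_gemm::KernelDescription pick_fp32_kernel(const arm_compute::CPUInfo &ci)
    {
        arm_compute::INEGEMMWrapperKernel::Params p{};
        p.M       = 128; // example sizes, not taken from the patch
        p.N       = 96;
        p.K       = 64;
        p.batches = 1;
        p.multis  = 1;
        // alpha = 1, beta = 0, one thread, no pretranspose hint:
        return arm_compute::get_gemm_info(arm_compute::DataType::F32, ci, 1, p, 1.f, 0.f, false);
    }
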
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
deleted file mode 100644
index 3b2975dd80..0000000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/WindowIterator.h"
-
-namespace arm_compute
-{
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
- const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- _prepared_a = prepared_a;
- _transformed_b = transformed_b;
- _tmp_c = tmp_c;
- _c = c;
- _block_walker = block_walker;
- _block_sizes = block_sizes;
- _params = params;
- _b_is_pretransposed = b_is_pretransposed;
- _alpha = alpha;
- _beta = beta;
-
- auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads }));
-}
-
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset,
- const Coordinates &end_offset)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- strategy strat(info.cpu_info);
- TensorAccessor<To> prepared_a(*_prepared_a);
- TensorAccessor<To> transformed_b(*_transformed_b);
- TensorAccessor<Tr> c(*_c);
- TensorAccessor<Tr> tmp_c(*_tmp_c);
-
- int prev_batch = -1;
- To *a_ptr = nullptr;
- auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
- {
- const unsigned int y = id.x();
- const unsigned int batch = id.y();
- const unsigned int ymax = std::min(_params.M, y + strategy::out_height());
-
- // If it's the first block of a new batch then reset the pointer to A.
- if(prev_batch != static_cast<int>(batch))
- {
- const unsigned int first_m = id.x();
- a_ptr = prepared_a(0, first_m, batch);
- prev_batch = batch;
- }
-
- // Call matrix multiply assembly routine to process the block:
- strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k);
- a_ptr += strategy::out_height() * wl._kern_k;
-
- // Merge the result with the other blocks' results:
- strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? _beta : static_cast<Tr>(1)));
- });
- auto on_new_row_size = [&](unsigned int start, unsigned int end)
- {
- //Nothing to do
- };
- window_iterator.iterate_2D(on_new_row_size);
-}
-
-template <typename To, typename Tr, bool use_dot>
-void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::create_workloads(std::vector<MatrixMultiplyWorkload> &workloads)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- unsigned int offset_transformed_b = 0;
- unsigned int wl_index = 0;
- unsigned int num_buffers = 0, reshaped_block_size = 0;
-
- if(!_b_is_pretransposed)
- {
- num_buffers = _transformed_b->info()->tensor_shape()[1];
- reshaped_block_size = _transformed_b->info()->tensor_shape()[0];
- }
- execute_window_loop(_block_walker, [&](const Coordinates & id)
- {
- const unsigned int x0 = id.x();
- const unsigned int k0 = id.y();
- const unsigned int multi = id.z();
-
- const unsigned int xmax = std::min(x0 + _block_walker.x().step(), _params.N);
- const unsigned int kmax = std::min(k0 + _block_walker.y().step(), _params.K);
-
- // Figure out how many "K" the kernel will actually process.
- const int kern_k = ceil_to_multiple(kmax - k0, strategy::k_unroll());
- const int bblocks = DIV_CEIL(xmax - x0, strategy::out_width());
-
- workloads.push_back(MatrixMultiplyWorkload(offset_transformed_b, x0, xmax, k0, kmax, multi, kern_k, bblocks));
-
- if(_b_is_pretransposed)
- {
- offset_transformed_b += bblocks * strategy::out_width() * kern_k;
- }
- else
- {
- // Rotate through the BufferManager's buffers:
- wl_index++;
- offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size;
- }
- });
-}
-
-//TODO: regroup somewhere ?
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<float, float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<uint8_t, uint32_t>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<int8_t, int32_t>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<uint8_t, uint32_t, true>;
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<int8_t, int32_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<float16_t, float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
deleted file mode 100644
index 7fc57f3c02..0000000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-namespace
-{
-// Call the lambda function for each workload generated by the passed window.
-template <typename To, bool use_dot, bool use_buffer_manager, typename Lambda>
-void for_each_element_in_window(const Window &window, const ITensor *b, ITensor *transformed_b, unsigned int N, unsigned int K, Lambda &&lambda)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
- unsigned int wl_index = 0;
- unsigned int num_buffers = 0, reshaped_block_size = 0;
-
- if(use_buffer_manager)
- {
- num_buffers = transformed_b->info()->tensor_shape()[1];
- reshaped_block_size = transformed_b->info()->strides_in_bytes().y();
- }
-
- unsigned int offset_transformed_b = transformed_b->info()->offset_first_element_in_bytes();
- execute_window_loop(window, [&](const Coordinates & coordinates)
- {
- const unsigned int x0 = coordinates.x();
- const unsigned int k0 = coordinates.y();
- const unsigned int multi = coordinates.z();
-
- const unsigned int offset_b = b->info()->offset_element_in_bytes(Coordinates(0, 0, multi));
- const unsigned int xmax = std::min(x0 + window.x().step(), N);
- const unsigned int kmax = std::min(k0 + window.y().step(), K);
-
- /* Figure out the size of each block. */
- unsigned int x_size = (xmax - x0);
- unsigned int k_size = (kmax - k0);
-
- /* Round sizes up as needed. */
- x_size = ceil_to_multiple(x_size, strategy::out_width());
- k_size = ceil_to_multiple(k_size, strategy::k_unroll());
-
- lambda(PrepareBWorkload(offset_b, offset_transformed_b, x0, xmax, k0, kmax));
-
- //Each workload represents one block:
- if(use_buffer_manager)
- {
- // Rotate through the BufferManager's buffers:
- wl_index++;
- offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size;
- }
- else
- {
- offset_transformed_b += (x_size * k_size * sizeof(To));
- }
- });
-}
-
-// Calculate the size of transformed_b:
-template <typename To, bool use_dot>
-unsigned int get_B_pretransposed_array_size(unsigned int N, unsigned int K, const BlockSizes &bs, unsigned int multis)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- // How many full blocks do N / K contain ?
- size_t num_full_k = K / bs.k_block;
- size_t num_full_x = N / bs.x_block;
-
- ARM_COMPUTE_ERROR_ON(bs.x_block % strategy::out_width() != 0);
- ARM_COMPUTE_ERROR_ON(bs.k_block % strategy::k_unroll() != 0);
-
- size_t normal_x_size = bs.x_block;
- size_t normal_k_size = bs.k_block;
-
- // Round up the leftovers to be a multiple of the strategy processing size:
- size_t left_over_x_size = ceil_to_multiple(N % bs.x_block, strategy::out_width());
- size_t left_over_k_size = ceil_to_multiple(K % bs.k_block, strategy::k_unroll());
-
- // Calculate the total size of the buffer:
- size_t total = num_full_k * normal_k_size * (num_full_x * normal_x_size + left_over_x_size);
- total += left_over_k_size * (left_over_x_size + num_full_x * normal_x_size);
-
- total *= multis;
-
- return total;
-}
-
-} // namespace
-
-template <typename To, bool use_dot>
-BlockSizes NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::block_sizes() const
-{
- return _block_sizes;
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::configure(const ITensor *b, ITensor *transformed_b, bool transpose_b, const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- const unsigned int multis = b->info()->tensor_shape().z();
- _Nsize = b->info()->tensor_shape().x();
- _Ksize = b->info()->tensor_shape().y();
- _b = b;
- _transformed_b = transformed_b;
- _transpose_b = transpose_b;
-
- _block_sizes = calculate_block_sizes<strategy>(ci, params.M, params.N, params.K);
-
- auto_init_if_empty(*transformed_b->info(), b->info()->clone()->set_tensor_shape(TensorShape{ get_B_pretransposed_array_size<To, use_dot>(_Nsize, _Ksize, _block_sizes, multis) }));
-
- Window window;
- window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_Nsize, _block_sizes.x_block), _block_sizes.x_block));
- window.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_Ksize, _block_sizes.k_block), _block_sizes.k_block));
- window.set(Window::DimZ, Window::Dimension(0, multis));
-
- INEKernel::configure(window);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::transform(const PrepareBWorkload &wl, const ThreadInfo &info)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- strategy strat(info.cpu_info);
- strat.transforms.PrepareB(reinterpret_cast<To *>(_transformed_b->buffer() + wl._offset_transformed_b),
- reinterpret_cast<To *>(_b->buffer() + wl._offset_b),
- _b->info()->strides_in_bytes().y() / sizeof(To),
- wl._x0, wl._xmax, wl._k0, wl._kmax, _transpose_b);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::create_workloads(std::vector<PrepareBWorkload> &workloads)
-{
- for_each_element_in_window<To, use_dot, true>(window(), _b, _transformed_b, _Nsize, _Ksize, [&workloads](PrepareBWorkload && wl)
- {
- workloads.push_back(std::move(wl));
- });
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedPrepareBWrapperKernelTemplate<To, use_dot>::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(window, INEKernel::window());
- for_each_element_in_window<To, use_dot, false>(window, _b, _transformed_b, _Nsize, _Ksize, [&](PrepareBWorkload && wl)
- {
- this->transform(wl, info);
- });
-}
-
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<uint8_t>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<int8_t>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<uint8_t, true>;
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<int8_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedPrepareBWrapperKernelTemplate<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
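
The deleted get_B_pretransposed_array_size() above sized the reshaped B buffer as full blocks plus leftovers rounded up to the kernel's processing width. A standalone restatement of that arithmetic with assumed example numbers; for N=100, K=50, x_block=48, k_block=16, out_width=12, k_unroll=4 it yields 5616 elements per multi:

    #include <cstddef>

    size_t pretransposed_B_size(size_t N, size_t K, size_t x_block, size_t k_block,
                                size_t out_width, size_t k_unroll, size_t multis)
    {
        auto ceil_to = [](size_t v, size_t m) { return ((v + m - 1) / m) * m; };
        const size_t full_x = N / x_block;                     // 100/48 = 2
        const size_t full_k = K / k_block;                     // 50/16  = 3
        const size_t left_x = ceil_to(N % x_block, out_width); // 4  -> 12
        const size_t left_k = ceil_to(K % k_block, k_unroll);  // 2  -> 4
        size_t total = full_k * k_block * (full_x * x_block + left_x); // 5184
        total += left_k * (full_x * x_block + left_x);                 // + 432
        return total * multis;
    }
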
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
index 69842fec80..da6ef2dea9 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,90 +44,175 @@
namespace arm_compute
{
-namespace
+namespace detail
{
-template <typename To, bool use_dot = false>
-struct Kernel
+/** GEMM Interleaved Strategy interface */
+class IInterleavedStrategy
{
+public:
+ /** Virtual Destructor */
+ virtual ~IInterleavedStrategy() = default;
+ /** Instantiate and configure a prepareB Kernel
+ *
+ * @param[in] b Input tensor B.
+ * @param[in] transformed_b Reshaped tensor B.
+ * @param[in] params M, N, K sizes.
+ * @param[in] ci CPUInfo to be used for kernel configuration.
+ *
+ * @return A wrapped specialized prepareB kernel
+ */
+ virtual std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b,
+ ITensor *transformed_b,
+ const INEGEMMWrapperKernel::Params &params,
+ const CPUInfo &ci) = 0;
+ /** Instantiate and configure a transformA Kernel
+ *
+ * @param[in] a Input tensor A.
+ * @param[in] transformed_a Reshaped tensor A.
+ * @param[in] block_walker Window representing the layout of the matrix's blocks.
+ * @param[in] params M, N, K sizes.
+ *
+ * @return A wrapped specialized transformA kernel
+ */
+ virtual std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a,
+ ITensor *transformed_a,
+ const Window &block_walker,
+ const INEGEMMWrapperKernel::Params &params) = 0;
+ /** Instantiate and configure a MatrixMultiply kernel
+ *
+ * @param transformed_a Already reshaped tensor A.
+ * @param transformed_b Already reshaped tensor B.
+ * @param tmp_c Temporary buffer to be used to store intermediate results.
+ * @param c Result tensor C.
+ * @param block_walker Window containing iteration information for the M and batch dimensions.
+ * @param block_sizes Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes).
+ * @param params M, N, K sizes.
+ * @param alpha Alpha value
+ * @param beta Beta value
+ * @param pretranspose_b Is B also pretransposed?
+ * @param num_threads Maximum number of threads that might be used for the calculations.
+ *
+ * @return A wrapped specialized MatrixMultiply kernel
+ */
+ virtual std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c,
+ const Window &block_walker, const BlockSizes &block_sizes,
+ const INEGEMMWrapperKernel::Params &params, float alpha, float beta, bool pretranspose_b,
+ unsigned int num_threads) = 0;
+ /** Calculates the block sizes of a given strategy
+ *
+ * @param[in] ci CPUInfo to be used for kernel configuration.
+ * @param[in] params M, N, K sizes.
+ *
+ * @return BlockSizes for a given strategy
+ */
+ virtual BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params) = 0;
};
-#define DEFINE_STRATEGY_SUFFIX(strat, suffix) \
- using strategy = arm_gemm::strat; \
- static constexpr const char *name = #strat suffix;
-
-#define DEFINE_STRATEGY(strat) \
- DEFINE_STRATEGY_SUFFIX(strat, "")
-
-#ifdef __ARM_FEATURE_SVE
-template <>
-struct Kernel<float, false>
-{
- DEFINE_STRATEGY(interleaved_fp32_mla_3VLx8)
-};
-template <>
-struct Kernel<float16_t, false>
-{
- DEFINE_STRATEGY(interleaved_fp16_mla_3VLx8)
-};
-template <bool use_dot>
-struct Kernel<int8_t, use_dot>
-{
- DEFINE_STRATEGY(interleaved_s8s32_dot_3VLx8)
-};
-template <bool use_dot>
-struct Kernel<uint8_t, use_dot>
+/** Interleaved Strategy class */
+template <typename StrategyType>
+class InterleavedStrategy : public IInterleavedStrategy
{
- DEFINE_STRATEGY(interleaved_u8u32_dot_3VLx8)
-};
-#else /* __ARM_FEATURE_SVE */
+public:
+ using strategy = StrategyType;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-struct Kernel<float16_t, false>
-{
- DEFINE_STRATEGY(hgemm_24x8)
-};
-#endif /*__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-#ifdef __aarch64__
-template <>
-struct Kernel<float, false>
-{
- DEFINE_STRATEGY(sgemm_12x8)
-};
-template <>
-struct Kernel<int8_t, false>
-{
- DEFINE_STRATEGY(gemm_s8_4x4)
-};
-template <>
-struct Kernel<uint8_t, false>
-{
- DEFINE_STRATEGY(gemm_u8_4x4)
-};
+public:
+ // Inherited methods overridden
+ std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b,
+ ITensor *transformed_b,
+ const INEGEMMWrapperKernel::Params &params,
+ const CPUInfo &ci) override
+ {
+ auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<strategy>>();
+ prepare_b->configure(b, transformed_b, false, ci, params);
+ return std::move(prepare_b);
+ }
+ std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a,
+ ITensor *transformed_a,
+ const Window &block_walker,
+ const INEGEMMWrapperKernel::Params &params) override
+ {
+ auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<strategy>>();
+ transform_a->configure(a, transformed_a, false, block_walker, params);
+ return std::move(transform_a);
+ }
+ std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c,
+ const Window &block_walker, const BlockSizes &block_sizes,
+ const INEGEMMWrapperKernel::Params &params, float alpha, float beta, bool pretranspose_b,
+ unsigned int num_threads) override
+ {
+ auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<strategy>>();
+ matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, num_threads);
+ return std::move(matrix_multiply);
+ }
-//Use different strategies for 8bit dot product:
-template <>
-struct Kernel<int8_t, true>
-{
- DEFINE_STRATEGY_SUFFIX(gemm_s8_12x8, "_dot")
+ BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params) override
+ {
+ return calculate_block_sizes<strategy>(ci, params.M, params.N, params.K);
+ }
};
-template <>
-struct Kernel<uint8_t, true>
-{
- DEFINE_STRATEGY_SUFFIX(gemm_u8_12x8, "_dot")
-};
-#else
-template <>
-struct Kernel<float, false>
-{
- DEFINE_STRATEGY(sgemm_8x6)
-};
-#endif /* __aarch64__ */
-#endif /* __ARM_FEATURE_SVE */
-
-#undef DEFINE_STRATEGY
-#undef DEFINE_STRATEGY_SUFFIX
-} // namespace
+/** Create the backend GEMM strategy to use given the provided kernel info
+ *
+ * @param[in] kernel_name Kernel name of the backend strategy to instantiate
+ *
+ * @return The requested kernel strategy if exists else nullptr
+ */
+std::unique_ptr<IInterleavedStrategy> create_strategy(const std::string &kernel_name)
+{
+#if defined(__arm__)
+ if(kernel_name.find("sgemm_8x6") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::sgemm_8x6>>();
+ }
+#endif // defined(__arm__)
+#if defined(__aarch64__)
+ if(kernel_name.find("gemm_s8_4x4") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_s8_4x4>>();
+ }
+ if(kernel_name.find("gemm_s8_12x8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_s8_12x8>>();
+ }
+ if(kernel_name.find("gemm_u8_4x4") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_u8_4x4>>();
+ }
+ if(kernel_name.find("gemm_u8_12x8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_u8_12x8>>();
+ }
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ if(kernel_name.find("hgemm_24x8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::hgemm_24x8>>();
+ }
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ if(kernel_name.find("sgemm_12x8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::sgemm_12x8>>();
+ }
+#if defined(__ARM_FEATURE_SVE)
+ if(kernel_name.find("interleaved_fp16_mla_3VLx8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_fp16_mla_3VLx8>>();
+ }
+ if(kernel_name.find("interleaved_fp32_mla_3VLx8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_fp32_mla_3VLx8>>();
+ }
+ if(kernel_name.find("interleaved_s8s32_dot_3VLx8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_s8s32_dot_3VLx8>>();
+ }
+ if(kernel_name.find("interleaved_u8u32_dot_3VLx8") != std::string::npos)
+ {
+ return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_u8u32_dot_3VLx8>>();
+ }
+#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(__aarch64__)
+ return nullptr;
+}
+} // namespace detail
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDSTRATEGIES_H__ */
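
With the per-type Kernel<> templates gone, the interleaved wrapper now resolves its strategy at run time from the kernel name that arm_gemm reports. A hedged lookup sketch (the wrapper function is illustrative; the kernel name would normally come from get_gemm_info()):

    #include <memory>
    #include <string>
    #include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"

    bool interleaved_strategy_exists(const std::string &kernel_name)
    {
        // create_strategy() substring-matches the arm_gemm kernel name,
        // e.g. "sgemm_12x8", and returns nullptr when nothing matches.
        std::unique_ptr<arm_compute::detail::IInterleavedStrategy> s =
            arm_compute::detail::create_strategy(kernel_name);
        return s != nullptr;
    }
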
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
deleted file mode 100644
index 3b80a1f940..0000000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
-
-#include "NEGEMMInterleavedStrategies.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/WindowIterator.h"
-
-#include "utils/TypePrinter.h"
-
-namespace arm_compute
-{
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker,
- const INEGEMMWrapperKernel::Params &params)
-{
- _a = a;
- _transformed_a = transformed_a;
- _transpose_a = transpose_a;
- _Ksize = params.K;
- _Msize = params.M;
- _k_multi_window = block_walker.shift_dimensions(1); // block_walker contains (M,K,Multi) --> shift by 1 to get rid of the "M" dimension
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::transform(const TransformAWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset,
- const Coordinates &end_offset)
-{
- using strategy = typename Kernel<To, use_dot>::strategy;
-
- strategy strat(info.cpu_info);
- TensorAccessor<To> a(*_a);
- TensorAccessor<To> transformed_a(*_transformed_a);
-
- if(_a->info()->data_layout() == DataLayout::NHWC)
- {
- // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is
- // the relevant multiple of the row stride.
- const size_t nhwc_batch_stride = _a->info()->strides_in_bytes().y() * _Msize;
- a.set_stride(2, nhwc_batch_stride);
- }
-
- unsigned int last_m = 0;
- //TODO: Create a new iterate_1D( DimY);
- int last_y = -1;
- auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
- {
- if(id.y() != last_y)
- {
- last_y = id.y();
- unsigned int batch = id.y();
- unsigned int first_m = id.x();
-
- if(first_m >= last_m)
- return;
-
- strat.transforms.PrepareA(transformed_a(0, first_m, batch),
- a(0, 0, batch, wl._multi),
- a.stride(1), first_m, last_m, wl._k0, wl._kmax, _transpose_a);
- }
- });
- auto on_new_row_size = [&](unsigned int start, unsigned int end)
- {
- last_m = std::min(end, _Msize);
- };
- window_iterator.iterate_2D(on_new_row_size);
-}
-
-template <typename To, bool use_dot>
-void NEGEMMInterleavedTransformAWrapperTemplate<To, use_dot>::create_workloads(std::vector<TransformAWorkload> &workloads)
-{
- execute_window_loop(_k_multi_window, [&](const Coordinates & id)
- {
- const unsigned int k0 = id.x();
- const unsigned int multi = id.y();
- const unsigned int kmax = std::min(k0 + _k_multi_window.x().step(), _Ksize);
-
- workloads.push_back(TransformAWorkload(k0, kmax, multi));
- });
-}
-
-template class NEGEMMInterleavedTransformAWrapperTemplate<float>;
-#ifdef __aarch64__
-template class NEGEMMInterleavedTransformAWrapperTemplate<uint8_t>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<int8_t>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<uint8_t, true>;
-template class NEGEMMInterleavedTransformAWrapperTemplate<int8_t, true>;
-#endif /* __aarch64__ */
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template class NEGEMMInterleavedTransformAWrapperTemplate<float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
index e452dfbcf2..7b1f3e7ba0 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,11 +34,7 @@
#include "../arm_gemm/mergeresults.hpp"
#include "../arm_gemm/transform.hpp"
-#include "../arm_gemm/kernels/a32_sgemm_8x6.hpp"
-#include "../arm_gemm/kernels/a64_sgemm_12x8.hpp"
#include "../arm_gemm/kernels/a64_sgemm_native_16x4.hpp"
-#include "../arm_gemm/kernels/a64_sgemv_pretransposed.hpp"
-#include "../arm_gemm/kernels/a64_sgemv_trans.hpp"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 25be4a5349..cd614ba582 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
#include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
@@ -38,14 +35,14 @@ namespace arm_compute
{
namespace
{
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+ const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
std::shared_ptr<IMemoryManager> memory_manager)
{
//Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
- switch(method)
+ switch(gemm_kernel_info.method)
{
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_FP16:
case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
{
if(!pretranspose_hint)
@@ -56,92 +53,24 @@ std::unique_ptr<IFunction> create_function_all_types(arm_gemm::GemmMethod method
function->configure(a, b, d, alpha, beta, pretranspose_hint);
return std::move(function);
}
- default:
- return nullptr;
- }
-}
-
-template <typename TypeInput, typename TypeOutput>
-std::unique_ptr<IFunction> create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- ARM_COMPUTE_UNUSED(method);
- ARM_COMPUTE_UNUSED(a);
- ARM_COMPUTE_UNUSED(b);
- ARM_COMPUTE_UNUSED(d);
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_UNUSED(pretranspose_hint);
- ARM_COMPUTE_UNUSED(memory_manager);
- return nullptr;
-}
-
-#ifdef __aarch64__
-template <>
-std::unique_ptr<IFunction> create_function<int8_t, int32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- switch(method)
- {
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
- {
- if(!pretranspose_hint)
- {
- return nullptr;
- }
- auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
- function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
- return std::move(function);
- }
- default:
- return nullptr;
- }
- return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<uint8_t, uint32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- switch(method)
- {
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
+#if defined(__aarch64__)
+ case arm_gemm::GemmMethod::GEMM_NATIVE:
{
- if(!pretranspose_hint)
+ if(gemm_kernel_info.name.find("sgemm_native_16x4") != std::string::npos)
{
- return nullptr;
+ auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
+ kernel->configure(a, b, d, alpha, beta);
+ auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
+ function->configure(std::move(kernel));
+ return std::move(function);
}
- auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
- function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
- return std::move(function);
- }
- default:
return nullptr;
- }
- return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<float, float>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- ARM_COMPUTE_UNUSED(pretranspose_hint);
- ARM_COMPUTE_UNUSED(memory_manager);
- switch(method)
- {
- case arm_gemm::GemmMethod::GEMM_NATIVE:
- {
- auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
- kernel->configure(a, b, d, alpha, beta);
- auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
- function->configure(std::move(kernel));
- return std::move(function);
}
+#endif // defined(__aarch64__)
default:
return nullptr;
}
}
-#endif /* __aarch64__ */
/** Fallback in case ACL doesn't have a function */
template <typename TypeInput, typename TypeOutput>
@@ -189,7 +118,7 @@ private:
template <typename TypeInput, typename TypeOutput>
void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group)
{
- _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args, nullptr);
+ _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args);
if(_gemm_kernel_asm == nullptr)
{
//configuration not supported: Leave function unconfigured:
@@ -334,12 +263,8 @@ void create_function_or_arm_gemm(std::unique_ptr<IFunction> &acl_function, std::
arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
//Try to create an ACL function:
- acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
- // If the type agnostic factory failed to create an ACL function, try the specialised one:
- if(acl_function == nullptr)
- {
- acl_function = create_function<TypeInput, TypeOutput>(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
- }
+ acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, std::move(memory_manager));
+
//If we still don't have an ACL function:
if(acl_function == nullptr)
{
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index fe998a0e42..695fc859de 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,12 +26,11 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"
+
#include <atomic>
#include <condition_variable>
#include <mutex>
@@ -179,6 +178,7 @@ NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManage
: _memory_group(std::move(memory_manager))
{
}
+
void NEGEMMInterleavedWrapper::run()
{
prepare();
@@ -334,38 +334,7 @@ void NEGEMMInterleavedWrapper::prepare()
}
}
-namespace
-{
-// Factory to instantiate NEGEMMInterleavedPrepareBWrapperKernel:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params &params)
-{
- auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<InputType, use_dot>>();
- prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params);
- return std::move(prepare_b);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params &params)
-{
- auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<InputType, use_dot>>();
- transform_a->configure(a, transformed_a, false, block_walker, params);
- return std::move(transform_a);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, typename OutputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
- const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool pretranspose_b, float alpha, float beta)
-{
- auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<InputType, OutputType, use_dot>>();
- matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads());
- return std::move(matrix_multiply);
-}
-} // namespace
-
-void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot)
+void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b)
{
_params = INEGEMMWrapperKernel::extract_parameters(a, b, c);
_a = a;
@@ -373,18 +342,26 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe
_c = c;
_pretranspose_b = pretranspose_b;
- DataType input_type = a->info()->data_type();
+ const DataType input_type = a->info()->data_type();
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, pretranspose_b);
+ ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED);
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
_transformed_b.allocator()->init(TensorInfo{}, alignment);
_tmp_c.allocator()->init(TensorInfo{}, alignment);
- _tag = "NEGEMMInterleaved_";
- _tag += get_strategy_name(input_type, use_dot);
+ _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name;
+
+ // Get strategy
+ std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(gemm_kernel_info.name);
+ ARM_COMPUTE_ERROR_ON(strategy == nullptr);
if(!_pretranspose_b)
{
- _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
+ _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params);
_batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
_batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
// If the execution is single threaded or has only one window then the buffer manager only needs 1 buffer else we will use NUM_BUFFERS buffers and ping pong between them:
@@ -409,43 +386,8 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe
{
_tag += "_preB";
}
- switch(input_type)
- {
- case DataType::F32:
- _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
- break;
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
- }
- break;
- case DataType::S8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
- }
- break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
+
+ _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci);
ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
if(_pretranspose_b)
@@ -463,51 +405,11 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe
_memory_group.manage(&_transformed_a);
_memory_group.manage(&_tmp_c);
- switch(input_type)
- {
- case DataType::F32:
- _transform_a = instantiate_transformA<float>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<float, float>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- break;
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- _transform_a = instantiate_transformA<uint8_t, true>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- else
- {
- _transform_a = instantiate_transformA<uint8_t, false>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- break;
- case DataType::S8:
- if(use_dot)
- {
- _transform_a = instantiate_transformA<int8_t, true>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- else
- {
- _transform_a = instantiate_transformA<int8_t, false>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _transform_a = instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- break;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- break;
- }
+ _transform_a = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params);
+ _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, pretranspose_b, num_threads);
ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
+
_transformed_a.allocator()->allocate();
_tmp_c.allocator()->allocate();
if(!_pretranspose_b)