From 7cd26d4a1b14bc4bf7c61496803416ab3d84791f Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 9 Jan 2019 18:35:17 +0000 Subject: COMPMID-1867: Add NEON/SVE GEMM Hybrid kernels. Change-Id: Ib40a9921e7f9a6a8be6c38872d6b3a0f24ed0cd3 Reviewed-on: https://review.mlplatform.org/515 Reviewed-by: Anthony Barbier Tested-by: Arm Jenkins --- arm_compute/core/NEON/kernels/assembly/Helpers.h | 41 +- .../NEGEMMInterleavedMatrixMultiplyWrapper.h | 130 +- .../NEGEMMInterleavedPrepareBWrapperKernel.h | 140 +- .../assembly/NEGEMMInterleavedTransformAWrapper.h | 71 +- .../core/NEON/kernels/assembly/arm_gemm.hpp | 88 +- .../core/NEON/kernels/assembly/gemm_common.hpp | 11 +- .../functions/assembly/NEGEMMInterleavedWrapper.h | 17 +- src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 92 +- src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 211 +- src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp | 303 ++ .../NEON/kernels/arm_gemm/gemm_implementation.hpp | 153 +- src/core/NEON/kernels/arm_gemm/gemm_int16.cpp | 39 +- src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 90 +- .../NEON/kernels/arm_gemm/gemm_interleaved.hpp | 71 +- src/core/NEON/kernels/arm_gemm/gemm_native.hpp | 21 +- src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp | 41 +- src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 86 +- src/core/NEON/kernels/arm_gemm/gemv_batched.hpp | 4 +- .../kernels/arm_gemm/gemv_native_transposed.hpp | 9 +- .../NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 21 +- .../arm_gemm/kernels/a64_hgemm_24x8/generic.cpp | 6 +- .../a64_sgemm_nativeA_pretransposeB_16x4.hpp | 78 + .../generic.cpp | 970 ++++ .../arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp | 74 + .../kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp | 2005 +++++++++ .../kernels/sve_interleaved_fp16_mla_3VLx8.hpp | 4 +- .../sve_interleaved_fp16_mla_3VLx8/generic.cpp | 48 +- .../kernels/sve_interleaved_fp32_mla_3VLx8.hpp | 4 +- .../sve_interleaved_fp32_mla_3VLx8/generic.cpp | 46 +- .../kernels/sve_interleaved_s8s32_dot_3VLx8.hpp | 4 +- 
.../sve_interleaved_s8s32_dot_3VLx8/generic.cpp | 46 +- .../kernels/sve_interleaved_u8u32_dot_3VLx8.hpp | 4 +- .../arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp | 73 + .../kernels/sve_native_fp32_mla_4VLx4/generic.cpp | 2066 +++++++++ .../kernels/sve_native_s8s32_dot_4VLx4.hpp | 73 + .../kernels/sve_native_s8s32_dot_4VLx4/generic.cpp | 4632 ++++++++++++++++++++ .../kernels/sve_native_u8u32_dot_4VLx4.hpp | 74 + .../kernels/sve_native_u8u32_dot_4VLx4/generic.cpp | 4632 ++++++++++++++++++++ .../arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp | 73 + .../kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp | 4264 ++++++++++++++++++ .../kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp | 73 + .../sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp | 4004 +++++++++++++++++ .../arm_gemm/merges/a64_merge_fp32_12x8.hpp | 1660 +++++++ src/core/NEON/kernels/arm_gemm/transform.hpp | 13 +- .../transforms/a64_interleave_8way_32bit.hpp | 6 +- src/core/NEON/kernels/arm_gemm/transforms/list.hpp | 15 +- .../transforms/sve_interleave_8way_32bit.hpp | 470 +- src/core/NEON/kernels/arm_gemm/utils.hpp | 44 +- src/core/NEON/kernels/assembly/Helpers.cpp | 100 +- .../NEGEMMInterleavedMatrixMultiplyWrapper.cpp | 152 - .../NEGEMMInterleavedPrepareBWrapperKernel.cpp | 189 - .../kernels/assembly/NEGEMMInterleavedStrategies.h | 239 +- .../NEGEMMInterleavedTransformAWrapper.cpp | 118 - .../kernels/assembly/NEGEMMNativeWrapperKernel.cpp | 6 +- .../NEON/functions/NEGEMMAssemblyDispatch.cpp | 107 +- .../assembly/NEGEMMInterleavedWrapper.cpp | 142 +- 56 files changed, 26516 insertions(+), 1637 deletions(-) create mode 100644 src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp create mode 100644 
src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp delete mode 100644 src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp delete mode 100644 src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp delete mode 100644 src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp diff --git a/arm_compute/core/NEON/kernels/assembly/Helpers.h b/arm_compute/core/NEON/kernels/assembly/Helpers.h index 11c4c08086..e2a46e96a3 100644 --- a/arm_compute/core/NEON/kernels/assembly/Helpers.h +++ b/arm_compute/core/NEON/kernels/assembly/Helpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,6 +27,9 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" +#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" + namespace arm_compute { /** Block sizes to use to break the M, N, K dimension */ @@ -38,31 +41,29 @@ struct BlockSizes unsigned int strategy_out_height{ 0 }; /**< Number of rows (M) processed by the selected strategy */ }; -/** Calculate the recommended block sizes to use based on the CPU cache sizes and data type - * - * @param[in] ci CPU information - * @param[in] M M dimension. - * @param[in] N N dimension. - * @param[in] K K dimension. - * @param[in] input_type Input data type - * @param[in] use_dot (Optional) If data_type is QASYMM8/U8/S8, then use the dot product instruction ? - * - * @return Recommeded block sizes to use for the given M, N, K dimensions. - */ -BlockSizes calculate_block_sizes_from_data_type(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K, DataType input_type, bool use_dot = false); - -/** Get the name of the GEMM strategy which will be used for a given input type +/** Extracts the kernel description of the selected kernel by the GEMM backend heuristics * - * @param[in] input_type Input data type - * @param[in] use_dot (Optional) If data_type is QASYMM8/U8/S8, then use the dot product instruction ? + * @param[in] input_type Data type of the input tensor. + * @param[in] ci CPU information. + * @param[in] num_threads Maximum number of threads that might be used for the calculations. + * @param[in] p M, N, K sizes. + * @param[in] alpha Alpha value. + * @param[in] beta Beta value. + * @param[in] pretranspose_hint Is B also pretransposed ? 
* - * @return The name of the strategy that will be used + * @return Kernel description that the assembly heuristics picked for the given configuration */ -const char *get_strategy_name(DataType input_type, bool use_dot = false); +arm_gemm::KernelDescription get_gemm_info(DataType input_type, + const CPUInfo &ci, + const unsigned int num_threads, + const INEGEMMWrapperKernel::Params &p, + float alpha, + float beta, + bool pretranspose_hint); /** Calculate the recommended block sizes to use based on the CPU cache sizes and the strategy which will be used * - * @param[in] ci CPU information + * @param[in] ci CPU information. * @param[in] M M dimension. * @param[in] N N dimension. * @param[in] K K dimension. diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h index 46a05abcdb..e2b849aa3d 100644 --- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h +++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -26,8 +26,13 @@ #include "arm_compute/core/NEON/kernels/assembly/Helpers.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/WindowIterator.h" namespace arm_compute { @@ -84,7 +89,7 @@ public: }; /** Equivalent to arm_gemm::GemmInterleaved's strategy::kernel() but using Compute Library types. */ -template +template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate : public NEGEMMInterleavedMatrixMultiplyWrapper { public: @@ -94,7 +99,7 @@ public: * @param[in] transformed_b Already reshaped matrix B. 
* @param[out] tmp_c Temporary buffer to be used to store intermediate results. * @param[in,out] c Result matrix C. - * @param[in] batch_window Window containing iteration information for the M and batch dimensions. + * @param[in] block_walker Window containing iteration information for the M and batch dimensions. * @param[in] block_sizes Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes). * @param[in] params M, N, K sizes. * @param[in] is_pretransposed Is B also pretransposed ? @@ -102,30 +107,117 @@ public: * @param[in] beta Beta value * @param[in] max_num_threads Maximum number of threads that might be used for the calculations. */ - void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &batch_window, const BlockSizes &block_sizes, - const INEGEMMWrapperKernel::Params ¶ms, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads); + void configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker, const BlockSizes &block_sizes, + const INEGEMMWrapperKernel::Params ¶ms, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads) + { + _prepared_a = prepared_a; + _transformed_b = transformed_b; + _tmp_c = tmp_c; + _c = c; + _block_walker = block_walker; + _block_sizes = block_sizes; + _params = params; + _b_is_pretransposed = b_is_pretransposed; + _alpha = alpha; + _beta = beta; + + auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads })); + } // Inherited methods overridden: - void transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override; - void create_workloads(std::vector &workloads) override; + void transform(const MatrixMultiplyWorkload 
&wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override + { + strategy strat(info.cpu_info); + TensorAccessor prepared_a(*_prepared_a); + TensorAccessor transformed_b(*_transformed_b); + TensorAccessor c(*_c); + TensorAccessor tmp_c(*_tmp_c); + + int prev_batch = -1; + typename strategy::operand_type *a_ptr = nullptr; + auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id) + { + const unsigned int y = id.x(); + const unsigned int batch = id.y(); + const unsigned int ymax = std::min(_params.M, y + strategy::out_height()); + + // If it's the first block of a new batch then reset the pointer to A. + if(prev_batch != static_cast(batch)) + { + const unsigned int first_m = id.x(); + a_ptr = prepared_a(0, first_m, batch); + prev_batch = batch; + } + + // Call matrix multiply assembly routine to process the block: + strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k); + a_ptr += strategy::out_height() * wl._kern_k; + + // Merge the result with the other blocks' results: + strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? 
_beta : static_cast(1))); + }); + auto on_new_row_size = [&](unsigned int start, unsigned int end) + { + //Nothing to do + }; + window_iterator.iterate_2D(on_new_row_size); + } + void create_workloads(std::vector &workloads) override + { + unsigned int offset_transformed_b = 0; + unsigned int wl_index = 0; + unsigned int num_buffers = 0, reshaped_block_size = 0; + + if(!_b_is_pretransposed) + { + num_buffers = _transformed_b->info()->tensor_shape()[1]; + reshaped_block_size = _transformed_b->info()->tensor_shape()[0]; + } + execute_window_loop(_block_walker, [&](const Coordinates & id) + { + const unsigned int x0 = id.x(); + const unsigned int k0 = id.y(); + const unsigned int multi = id.z(); + + const unsigned int xmax = std::min(x0 + _block_walker.x().step(), _params.N); + const unsigned int kmax = std::min(k0 + _block_walker.y().step(), _params.K); + + // Figure out how many "K" the kernel will actually process. + const int kern_k = ceil_to_multiple(kmax - k0, strategy::k_unroll()); + const int bblocks = DIV_CEIL(xmax - x0, strategy::out_width()); + + workloads.push_back(MatrixMultiplyWorkload(offset_transformed_b, x0, xmax, k0, kmax, multi, kern_k, bblocks)); + + if(_b_is_pretransposed) + { + offset_transformed_b += bblocks * strategy::out_width() * kern_k; + } + else + { + // Rotate through the BufferManager's buffers: + wl_index++; + offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size; + } + }); + } private: const ITensor *_prepared_a { nullptr }; - const ITensor *_transformed_b{ nullptr }; - ITensor *_tmp_c{ nullptr }; - ITensor *_c{ nullptr }; - unsigned int _Nsize{ 0 }; - unsigned int _Ksize{ 0 }; - bool _transpose_b{ false }; - BlockSizes _block_sizes{}; - INEGEMMWrapperKernel::Params _params{}; - Window _block_walker{}; - bool _b_is_pretransposed{ false }; - Tr _alpha{}; - Tr _beta{}; + const ITensor *_transformed_b{ nullptr }; + ITensor *_tmp_c{ nullptr }; + ITensor *_c{ nullptr }; + unsigned int _Nsize{ 0 }; + unsigned int _Ksize{ 0 
}; + bool _transpose_b{ false }; + BlockSizes _block_sizes{}; + INEGEMMWrapperKernel::Params _params{}; + Window _block_walker{}; + bool _b_is_pretransposed{ false }; + typename strategy::result_type _alpha{}; + typename strategy::result_type _beta{}; }; } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h index e46c33018b..ba3223f66d 100644 --- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h +++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,14 +24,16 @@ #ifndef __ARM_COMPUTE_NEGEMMINTERLEAVEDPREPAREBWRAPPERKERNEL_H__ #define __ARM_COMPUTE_NEGEMMINTERLEAVEDPREPAREBWRAPPERKERNEL_H__ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/NEON/kernels/assembly/Helpers.h" #include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" namespace arm_compute { -class ITensor; - /** Unit of work for @ref NEGEMMInterleavedPrepareBWrapperKernel to process */ struct PrepareBWorkload { @@ -56,6 +58,84 @@ struct PrepareBWorkload unsigned int _kmax; /**< Last value to process along the K dimension. */ }; +namespace detail +{ +// Call the lambda function for each workload generated by the passed window. 
+template +void for_each_element_in_window(const Window &window, const ITensor *b, ITensor *transformed_b, unsigned int N, unsigned int K, Lambda &&lambda) +{ + unsigned int wl_index = 0; + unsigned int num_buffers = 0, reshaped_block_size = 0; + + if(use_buffer_manager) + { + num_buffers = transformed_b->info()->tensor_shape()[1]; + reshaped_block_size = transformed_b->info()->strides_in_bytes().y(); + } + + unsigned int offset_transformed_b = transformed_b->info()->offset_first_element_in_bytes(); + execute_window_loop(window, [&](const Coordinates & coordinates) + { + const unsigned int x0 = coordinates.x(); + const unsigned int k0 = coordinates.y(); + const unsigned int multi = coordinates.z(); + + const unsigned int offset_b = b->info()->offset_element_in_bytes(Coordinates(0, 0, multi)); + const unsigned int xmax = std::min(x0 + window.x().step(), N); + const unsigned int kmax = std::min(k0 + window.y().step(), K); + + /* Figure out the size of each block. */ + unsigned int x_size = (xmax - x0); + unsigned int k_size = (kmax - k0); + + /* Round sizes up as needed. */ + x_size = ceil_to_multiple(x_size, strategy::out_width()); + k_size = ceil_to_multiple(k_size, strategy::k_unroll()); + + lambda(PrepareBWorkload(offset_b, offset_transformed_b, x0, xmax, k0, kmax)); + + //Each workload represents one block: + if(use_buffer_manager) + { + // Rotate through the BufferManager's buffers: + wl_index++; + offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size; + } + else + { + offset_transformed_b += (x_size * k_size * sizeof(typename strategy::operand_type)); + } + }); +} + +// Calculate the size of transformed_b: +template +unsigned int get_B_pretransposed_array_size(unsigned int N, unsigned int K, const BlockSizes &bs, unsigned int multis) +{ + // How many full blocks do N / K contain ? 
+ size_t num_full_k = K / bs.k_block; + size_t num_full_x = N / bs.x_block; + + ARM_COMPUTE_ERROR_ON(bs.x_block % strategy::out_width() != 0); + ARM_COMPUTE_ERROR_ON(bs.k_block % strategy::k_unroll() != 0); + + size_t normal_x_size = bs.x_block; + size_t normal_k_size = bs.k_block; + + // Round up the leftovers to be a multiple of the strategy processing size: + size_t left_over_x_size = ceil_to_multiple(N % bs.x_block, strategy::out_width()); + size_t left_over_k_size = ceil_to_multiple(K % bs.k_block, strategy::k_unroll()); + + // Calculate the total size of the buffer: + size_t total = num_full_k * normal_k_size * (num_full_x * normal_x_size + left_over_x_size); + total += left_over_k_size * (left_over_x_size + num_full_x * normal_x_size); + + total *= multis; + + return total; +} +} // namespace detail + /** Common interface for the templated wrappers around the B reshape NEON assembly implementations */ class NEGEMMInterleavedPrepareBWrapperKernel : public INEKernel { @@ -93,7 +173,7 @@ public: /** Equivalent to arm_gemm::GemmInterleaved's strategy::transforms::PrepareB() but using Compute Library types. */ -template +template class NEGEMMInterleavedPrepareBWrapperKernelTemplate : public NEGEMMInterleavedPrepareBWrapperKernel { public: @@ -105,13 +185,55 @@ public: * @param[in] ci CPU information * @param[in] params M, N, K sizes. 
*/ - void configure(const ITensor *b, ITensor *transformed_b, bool transpose_b, const CPUInfo &ci, const INEGEMMWrapperKernel::Params ¶ms); + void configure(const ITensor *b, ITensor *transformed_b, bool transpose_b, const CPUInfo &ci, const INEGEMMWrapperKernel::Params ¶ms) + { + const unsigned int multis = b->info()->tensor_shape().z(); + _Nsize = b->info()->tensor_shape().x(); + _Ksize = b->info()->tensor_shape().y(); + _b = b; + _transformed_b = transformed_b; + _transpose_b = transpose_b; + + _block_sizes = calculate_block_sizes(ci, params.M, params.N, params.K); + + auto_init_if_empty(*transformed_b->info(), b->info()->clone()->set_tensor_shape(TensorShape{ detail::get_B_pretransposed_array_size(_Nsize, _Ksize, _block_sizes, multis) })); + + Window window; + window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_Nsize, _block_sizes.x_block), _block_sizes.x_block)); + window.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_Ksize, _block_sizes.k_block), _block_sizes.k_block)); + window.set(Window::DimZ, Window::Dimension(0, multis)); + + INEKernel::configure(window); + } // Inherited methods overridden: - void transform(const PrepareBWorkload &wl, const ThreadInfo &info) override; - void create_workloads(std::vector &workloads) override; - void run(const Window &window, const ThreadInfo &info) override; - BlockSizes block_sizes() const override; + void transform(const PrepareBWorkload &wl, const ThreadInfo &info) override + { + strategy strat(info.cpu_info); + strat.transforms.PrepareB(reinterpret_cast(_transformed_b->buffer() + wl._offset_transformed_b), + reinterpret_cast(_b->buffer() + wl._offset_b), + _b->info()->strides_in_bytes().y() / sizeof(typename strategy::operand_type), + wl._x0, wl._xmax, wl._k0, wl._kmax, _transpose_b); + } + void create_workloads(std::vector &workloads) override + { + detail::for_each_element_in_window(window(), _b, _transformed_b, _Nsize, _Ksize, [&workloads](PrepareBWorkload && wl) + { + 
workloads.push_back(std::move(wl)); + }); + } + void run(const Window &window, const ThreadInfo &info) override + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(window, INEKernel::window()); + detail::for_each_element_in_window(window, _b, _transformed_b, _Nsize, _Ksize, [&](PrepareBWorkload && wl) + { + this->transform(wl, info); + }); + } + BlockSizes block_sizes() const override + { + return _block_sizes; + } private: const ITensor *_b diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h index b6831e3ca9..5d6cd02398 100644 --- a/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h +++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,13 @@ #define __ARM_COMPUTE_NEGEMMINTERLEAVEDTRANSFORMAWRAPPER_H__ #include "arm_compute/core/CPP/CPPTypes.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/WindowIterator.h" namespace arm_compute { @@ -76,7 +81,7 @@ public: }; /** Type specialisations of @ref NEGEMMInterleavedTransformAWrapper */ -template +template class NEGEMMInterleavedTransformAWrapperTemplate : public NEGEMMInterleavedTransformAWrapper { public: @@ -88,11 +93,67 @@ public: * @param[in] block_walker Window representing the layout of the matrix's blocks * @param[in] params M, N, K sizes. 
*/ - void configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker, const INEGEMMWrapperKernel::Params ¶ms); + void configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker, const INEGEMMWrapperKernel::Params ¶ms) + { + _a = a; + _transformed_a = transformed_a; + _transpose_a = transpose_a; + _Ksize = params.K; + _Msize = params.M; + _k_multi_window = block_walker.shift_dimensions(1); // block_walker contains (M,K,Multi) --> shift by 1 to get rid of the "M" dimension + } // Inherited methods overridden: - void transform(const TransformAWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override; - void create_workloads(std::vector &workloads) override; + void transform(const TransformAWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, const Coordinates &end_offset) override + { + strategy strat(info.cpu_info); + TensorAccessor a(*_a); + TensorAccessor transformed_a(*_transformed_a); + + if(_a->info()->data_layout() == DataLayout::NHWC) + { + // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is + // the relevant multiple of the row stride. 
+ const size_t nhwc_batch_stride = _a->info()->strides_in_bytes().y() * _Msize; + a.set_stride(2, nhwc_batch_stride); + } + + unsigned int last_m = 0; + //TODO: Create a new iterate_1D( DimY); + int last_y = -1; + auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id) + { + if(id.y() != last_y) + { + last_y = id.y(); + unsigned int batch = id.y(); + unsigned int first_m = id.x(); + + if(first_m >= last_m) + return; + + strat.transforms.PrepareA(transformed_a(0, first_m, batch), + a(0, 0, batch, wl._multi), + a.stride(1), first_m, last_m, wl._k0, wl._kmax, _transpose_a); + } + }); + auto on_new_row_size = [&](unsigned int start, unsigned int end) + { + last_m = std::min(end, _Msize); + }; + window_iterator.iterate_2D(on_new_row_size); + } + void create_workloads(std::vector &workloads) override + { + execute_window_loop(_k_multi_window, [&](const Coordinates & id) + { + const unsigned int k0 = id.x(); + const unsigned int multi = id.y(); + const unsigned int kmax = std::min(k0 + _k_multi_window.x().step(), _Ksize); + + workloads.push_back(TransformAWorkload(k0, kmax, multi)); + }); + } private: const ITensor *_a diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp index 162cbc5c46..26c1f3df89 100644 --- a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp +++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,6 +24,7 @@ #pragma once #include +#include #include "arm_gemm_local.hpp" #include "gemm_common.hpp" @@ -37,45 +38,57 @@ enum class GemmMethod GEMV_PRETRANSPOSED, GEMV_NATIVE_TRANSPOSED, GEMM_NATIVE, - GEMM_INTERLEAVED, - GEMM_INTERLEAVED_FP16, - GEMM_INTERLEAVED_DOT + GEMM_HYBRID, + GEMM_INTERLEAVED +}; + + +struct KernelDescription +{ + GemmMethod method = GemmMethod::DEFAULT; + std::string name = ""; + + KernelDescription(GemmMethod m, std::string n) : method(m), name(n) { } + KernelDescription() { } }; struct GemmConfig { - GemmMethod method = GemmMethod::DEFAULT; + GemmMethod method = GemmMethod::DEFAULT; + std::string filter = ""; unsigned int inner_block_size = 0; unsigned int outer_block_size = 0; GemmConfig(GemmMethod method) : method(method) { } + GemmConfig() { } }; template struct GemmArgs { public: - const CPUInfo *_ci; - unsigned int _Msize; - unsigned int _Nsize; - unsigned int _Ksize; - unsigned int _nbatches; - unsigned int _nmulti; - bool _trA; - bool _trB; - T _alpha; - T _beta; - int _maxthreads; - bool _pretransposed_hint; + const CPUInfo *_ci; + unsigned int _Msize; + unsigned int _Nsize; + unsigned int _Ksize; + unsigned int _nbatches; + unsigned int _nmulti; + bool _trA; + bool _trB; + T _alpha; + T _beta; + int _maxthreads; + bool _pretransposed_hint; + const GemmConfig *_cfg; GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB, const T alpha, const T beta, const int maxthreads, - const bool pretransposed_hint) : - _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), - _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), - _pretransposed_hint(pretransposed_hint) + const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) : + _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), + _trA(trA), 
_trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), + _pretransposed_hint(pretransposed_hint), _cfg(cfg) { } }; @@ -90,7 +103,7 @@ using UniqueGemmCommon = std::unique_ptr >; * provided parameters be provided using the supplied method? */ template -bool method_is_compatible(GemmMethod method, GemmArgs &args); +bool method_is_compatible(GemmMethod method, const GemmArgs &args); template bool method_is_compatible(GemmMethod method, const CPUInfo &ci, @@ -107,14 +120,14 @@ bool method_is_compatible(GemmMethod method, const CPUInfo &ci, /* get_gemm_method(): Given the templated types and provided parameters, * which is the preferred method to implement this GEMM? */ template -GemmMethod get_gemm_method(GemmArgs &args); +KernelDescription get_gemm_method(const GemmArgs &args); template -GemmMethod get_gemm_method(const CPUInfo &ci, - const unsigned int M, const unsigned int N, const unsigned int K, - const unsigned int nbatches, const unsigned int nmulti, - const bool trA, const bool trB, const Tret alpha, const Tret beta, - const int maxthreads, const bool pretransposed_hint) +KernelDescription get_gemm_method(const CPUInfo &ci, + const unsigned int M, const unsigned int N, const unsigned int K, + const unsigned int nbatches, const unsigned int nmulti, + const bool trA, const bool trB, const Tret alpha, const Tret beta, + const int maxthreads, const bool pretransposed_hint) { GemmArgs args(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint); @@ -122,7 +135,7 @@ GemmMethod get_gemm_method(const CPUInfo &ci, } template -UniqueGemmCommon gemm(GemmArgs &args, GemmConfig *cfg); +UniqueGemmCommon gemm(const GemmArgs &args); /** Request an object to process a GEMM. 
* @@ -146,10 +159,25 @@ UniqueGemmCommon gemm(const CPUInfo &ci, const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB, const Tret alpha, const Tret beta, const int maxthreads, const bool pretransposed_hint, GemmConfig *cfg=nullptr) +{ + GemmArgs args(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint, cfg); + + return gemm(args); +} + +template +std::vector get_compatible_kernels(const GemmArgs &args); + +template +std::vector get_compatible_kernels(const CPUInfo &ci, + const unsigned int M, const unsigned int N, const unsigned int K, + const unsigned int nbatches, const unsigned int nmulti, + const bool trA, const bool trB, const Tret alpha, const Tret beta, + const int maxthreads, const bool pretransposed_hint) { GemmArgs args(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint); - return gemm(args, cfg); + return get_compatible_kernels(args); } } // namespace arm_gemm diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp index b43d6eaca6..7b4f0149e3 100644 --- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp +++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -88,11 +88,11 @@ public: * This has an empty default implementation, as GEMMs which don't care * about thread count can safely ignore this. */ - virtual void set_nthreads(int nthreads) { }; + virtual void set_nthreads(int) { }; /* Actually do the work. Provide a threadid to index any per-thread * buffers, and a start/end range to indicate which work to do. 
*/ - virtual void execute(unsigned int start, unsigned int end, int threadid) = 0; + virtual void execute(unsigned int, unsigned int, int) = 0; /*** Working space interface (optional) ***/ /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */ @@ -108,9 +108,10 @@ public: /* Total number of bytes of space needed for pretransposed arrays. */ virtual size_t get_B_pretransposed_array_size() const { return 0; } /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ - virtual void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) { }; + /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ + virtual void pretranspose_B_array(void *, const To *, const int, const int) { }; /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ - virtual void set_pretransposed_B_data(void *buffer) { } + virtual void set_pretransposed_B_data(void *) { } // Destructor virtual ~GemmCommon() { } diff --git a/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h b/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h index 26236ffb35..3ccfbc512b 100644 --- a/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h +++ b/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,9 @@ #include "arm_compute/core/NEON/kernels/assembly/Helpers.h" #include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" +#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h" +#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h" +#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IScheduler.h" @@ -36,13 +39,8 @@ namespace arm_compute { +// Forward declarations class ITensor; -class NEGEMMInterleavedPrepareBWrapperKernel; -class PrepareBWorkload; -class TransformAWorkload; -class MatrixMultiplyWorkload; -class NEGEMMInterleavedTransformAWrapper; -class NEGEMMInterleavedMatrixMultiplyWrapper; /** Buffer manager used when reshaping B on the fly * @@ -97,6 +95,7 @@ class NEGEMMInterleavedWrapper : public IFunction { public: NEGEMMInterleavedWrapper(std::shared_ptr memory_manager = nullptr); + ~NEGEMMInterleavedWrapper() = default; NEGEMMInterleavedWrapper(const NEGEMMInterleavedWrapper &) = delete; NEGEMMInterleavedWrapper &operator=(const NEGEMMInterleavedWrapper &) = delete; @@ -111,9 +110,8 @@ public: * @param[in] alpha Scalar multiplier to apply to AB matrix product. * @param[in] beta Scalar multiplier to apply to input C matrix before adding product. * @param[in] pretranspose_b If true, pretranspose B once during the prepare() stage instead of on the fly every time. - * @param[in] use_dot (Optional) If the input's type is U8/S8/QASYMM8 then use the dot product flavour or the matrix multiply routine. (Must be supported by the hardware). 
*/ - void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot = false); + void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b); // Inherited methods overridden: void run() override; @@ -143,6 +141,5 @@ private: std::vector _workloads{}; std::string _tag{}; }; - } // namespace arm_compute #endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDWRAPPER_H__ */ diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp index 9194bdd4d4..1a90e96140 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -38,75 +38,51 @@ namespace arm_gemm { -#ifdef __ARM_FEATURE_SVE -class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> { -public: - - UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override { - return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved(args)); - } - - GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { } -}; - -#elif defined(__aarch64__) - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) -class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> { -public: +static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = { +#if defined(__ARM_FEATURE_SVE) +{ + GemmMethod::GEMM_INTERLEAVED, + "interleaved_fp16_mla_3VLx8", + [](const GemmArgs<__fp16> &args) { return (args._Ksize > 4); }, + [](const GemmArgs<__fp16> &args) { return true; }, + [](const GemmArgs<__fp16> &args) { return new GemmInterleaved(args); } +}, +#endif +#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)) +{ + 
GemmMethod::GEMM_INTERLEAVED, + "hgemm_24x8", + [](const GemmArgs<__fp16> &args) { #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - bool is_supported(const GemmArgs<__fp16> &args) override { return args._ci->has_fp16(); - } -#endif - - UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override { - return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved(args)); - } - - GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { } -}; -#endif - -#endif // __aarch64__ - -class GemmImpl_gemm_fp16_interleaved : public GemmImplementation<__fp16, __fp16> { -public: - UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override { -#ifdef __aarch64__ - return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved(args)); -#elif defined(__arm__) - return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved(args)); #else -# error Unknown Architecture + return true; #endif - } - - GemmImpl_gemm_fp16_interleaved() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED) { } -}; - -#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE)) -static GemmImpl_gemm_fp16_interleaved_fp16 gemm_fp16_interleaved_fp16_impl{}; -#endif -static GemmImpl_gemm_fp16_interleaved gemm_fp16_interleaved_impl{}; - -static std::vector *> gemm_fp16_methods = { -#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE)) - &gemm_fp16_interleaved_fp16_impl, + }, + [](const GemmArgs<__fp16> &args) { return true; }, + [](const GemmArgs<__fp16> &args) { return new GemmInterleaved(args); } +}, #endif - &gemm_fp16_interleaved_impl +{ + GemmMethod::DEFAULT, + "", + nullptr, + nullptr, + nullptr, +} }; template<> -std::vector *> &gemm_implementation_list<__fp16, __fp16>() { +const GemmImplementation<__fp16, __fp16> *gemm_implementation_list<__fp16, __fp16>() { return 
gemm_fp16_methods; } /* Explicitly instantiate the external functions for these types. */ -template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(GemmArgs<__fp16> &args, GemmConfig *cfg); -template GemmMethod get_gemm_method<__fp16, __fp16>(GemmArgs<__fp16> &args); -template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, GemmArgs<__fp16> &args); +template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(const GemmArgs<__fp16> &args); +template KernelDescription get_gemm_method<__fp16, __fp16>(const GemmArgs<__fp16> &args); +template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, const GemmArgs<__fp16> &args); +template std::vector get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args); } // namespace arm_gemm -#endif // __ARM_FP16_ARGS +#endif // __ARM_FP16_ARGS \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index 7d14971b70..8bc33ccb69 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,7 @@ */ #include "arm_gemm.hpp" #include "gemm_common.hpp" +#include "gemm_hybrid.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" #include "gemm_native.hpp" @@ -30,112 +31,140 @@ #include "gemv_native_transposed.hpp" #include "gemv_pretransposed.hpp" -#include "kernels/a64_sgemm_12x8.hpp" #include "kernels/a32_sgemm_8x6.hpp" -#include "kernels/a64_sgemv_trans.hpp" -#include "kernels/a64_sgemv_pretransposed.hpp" +#include "kernels/a64_sgemm_12x8.hpp" #include "kernels/a64_sgemm_native_16x4.hpp" +#include "kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp" +#include "kernels/a64_sgemv_pretransposed.hpp" +#include "kernels/a64_sgemv_trans.hpp" +#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp" #include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp" +#include "kernels/sve_native_fp32_mla_4VLx4.hpp" +#include "kernels/sve_smallK_fp32_mla_1VLx4.hpp" +#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp" namespace arm_gemm { -#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) -// SGEMM implementations for AArch64 without SVE - -// Pretransposed GEMV -class GemmImpl_sgemm_gemv_pretransposed : public GemmImplementation { -public: - bool is_supported(const GemmArgs &args) override { - return (args._Msize==1 && args._alpha==1.0f && args._pretransposed_hint && args._nbatches==1); - } +static const GemmImplementation gemm_fp32_methods[] = +{ +{ + GemmMethod::GEMV_BATCHED, + "gemv_batched", + [](const GemmArgs &args) { return (args._Msize==1) && (args._nbatches>1); }, + nullptr, + [](const GemmArgs &args) { return new GemvBatched(args); } +}, +#ifdef __aarch64__ +{ + GemmMethod::GEMV_PRETRANSPOSED, + "sgemv_pretransposed", + [](const GemmArgs &args) { return (args._Msize==1 && args._alpha==1.0f && args._pretransposed_hint && args._nbatches==1); }, + nullptr, + [](const GemmArgs &args) { return new GemvPretransposed(args); } +}, +{ + GemmMethod::GEMV_NATIVE_TRANSPOSED, + "sgemv_trans", + [](const GemmArgs 
&args) { return (args._Msize==1 && args._alpha==1.0f && !args._trA && !args._trB && args._nbatches==1); }, + nullptr, + [](const GemmArgs &args) { return new GemvNativeTransposed(args); } +}, - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon (new GemvPretransposed(args._ci, args._Nsize, args._Ksize, args._nmulti, args._trB, args._beta)); - } - - GemmImpl_sgemm_gemv_pretransposed() : GemmImplementation(GemmMethod::GEMV_PRETRANSPOSED) { } -}; - -// Native GEMV -class GemmImpl_sgemm_gemv_native_transposed : public GemmImplementation { -public: - bool is_supported(const GemmArgs &args) override { - return (args._Msize==1 && args._alpha==1.0f && !args._trA && !args._trB && args._nbatches==1); - } - - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon (new GemvNativeTransposed(args._ci, args._Nsize, args._Ksize, args._nmulti, args._beta)); - } - - GemmImpl_sgemm_gemv_native_transposed() : GemmImplementation(GemmMethod::GEMV_NATIVE_TRANSPOSED) { } -}; - -// Native GEMM -class GemmImpl_sgemm_gemm_native : public GemmImplementation { -public: - bool is_supported(const GemmArgs &args) override { - return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); - } - - bool is_recommended(const GemmArgs &args) override { - return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); - } - - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon (new GemmNative(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, args._beta)); - } - - GemmImpl_sgemm_gemm_native() : GemmImplementation(GemmMethod::GEMM_NATIVE) { } -}; -#endif // __aarch64__ - -// Interleaved GEMM -class GemmImpl_sgemm_gemm_interleaved : public GemmImplementation { -public: - UniqueGemmCommon instantiate(const GemmArgs &args) override { #ifdef __ARM_FEATURE_SVE - return UniqueGemmCommon 
(new GemmInterleaved(args)); -#elif defined(__aarch64__) - return UniqueGemmCommon (new GemmInterleaved(args)); -#elif defined(__arm__) - return UniqueGemmCommon (new GemmInterleaved(args)); -#else -# error Unknown Architecture. -#endif - } - - GemmImpl_sgemm_gemm_interleaved() : GemmImplementation(GemmMethod::GEMM_INTERLEAVED) { } -}; + // SVE smallk / native / hybrid methods +{ + GemmMethod::GEMM_HYBRID, + "smallK_hybrid_fp32_mla_1VLx4", + [](const GemmArgs &args) { return (args._Ksize <= 24) && !args._trA && args._alpha==1.0f && args._pretransposed_hint; }, + nullptr, + [](const GemmArgs &args) { return new GemmHybrid(args); } +}, +{ + GemmMethod::GEMM_HYBRID, + "hybrid_fp32_mla_4VLx4", + [](const GemmArgs &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; }, + [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return new GemmHybrid(args); } +}, +{ + GemmMethod::GEMM_NATIVE, + "smallK_fp32_mla_1VLx4", + [](const GemmArgs &args) { return (args._Ksize <= 24) && !args._trA && !args._trB && args._alpha==1.0f; }, + nullptr, + [](const GemmArgs &args) { return new GemmNative(args); } +}, +{ + GemmMethod::GEMM_NATIVE, + "native_fp32_mla_4VLx4", + [](const GemmArgs &args) { return (args._Ksize>4 && args._alpha==1.0f && !args._trA && !args._trB); }, + [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return new GemmNative(args); } +}, +#endif // __ARM_FEATURE_SVE + +// NEON native / hybrid methods +{ + GemmMethod::GEMM_HYBRID, + "sgemm_nativeA_pretransposeB_16x4", + [](const GemmArgs &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; }, + [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 
256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return new GemmHybrid(args); } +}, +{ + GemmMethod::GEMM_NATIVE, + "sgemm_native_16x4", + [](const GemmArgs &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); }, + [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return new GemmNative(args); } +}, -static GemmImpl_gemv_batched gemv_batched_impl{}; -#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) -static GemmImpl_sgemm_gemv_pretransposed sgemm_gemv_pretransposed_impl{}; -static GemmImpl_sgemm_gemv_native_transposed sgemm_gemv_native_transposed_impl{}; -static GemmImpl_sgemm_gemm_native sgemm_gemm_native_impl{}; -#endif -static GemmImpl_sgemm_gemm_interleaved sgemm_gemm_interleaved_impl{}; +#ifdef __ARM_FEATURE_SVE + { + GemmMethod::GEMM_INTERLEAVED, + "interleaved_fp32_mla_3VLx8", + [](const GemmArgs &args) { return (args._Ksize>4); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +#endif // __ARM_FEATURE_SVE +{ + GemmMethod::GEMM_INTERLEAVED, + "sgemm_12x8", + nullptr, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +#endif // __aarch64__ -/* List of implementations (order matters) */ -static std::vector *> SGemmMethods = { - &gemv_batched_impl, -#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) - &sgemm_gemv_pretransposed_impl, - &sgemm_gemv_native_transposed_impl, - &sgemm_gemm_native_impl, -#endif - &sgemm_gemm_interleaved_impl +#ifdef __arm__ + { + GemmMethod::GEMM_INTERLEAVED, + "sgemm_8x6", + nullptr, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +#endif // __arm__ +{ + GemmMethod::DEFAULT, + "", + nullptr, + nullptr, + nullptr +} }; /* Templated function to return this list. 
*/ template<> -std::vector *> &gemm_implementation_list() { - return SGemmMethods; +const GemmImplementation *gemm_implementation_list() { + return gemm_fp32_methods; } /* Explicitly instantiate the external functions for these types. */ -template UniqueGemmCommon gemm(GemmArgs &args, GemmConfig *cfg); -template GemmMethod get_gemm_method(GemmArgs &args); -template bool method_is_compatible(GemmMethod method, GemmArgs &args); +template UniqueGemmCommon gemm(const GemmArgs &args); +template KernelDescription get_gemm_method(const GemmArgs &args); +template bool method_is_compatible(GemmMethod method, const GemmArgs &args); +template std::vector get_compatible_kernels (const GemmArgs &args); -} // namespace arm_gemm +} // namespace arm_gemm \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp new file mode 100644 index 0000000000..09f03c6332 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +#include + +#include "arm_gemm.hpp" +#include "utils.hpp" + +#include "mergeresults.hpp" +#include "transform.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +namespace arm_gemm { + +// Implementation of the GemmCommon abstract class. +template +class GemmHybrid : public GemmCommon { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + /* const properties set by constructor */ + const CPUInfo * const _ci; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + const unsigned int _nbatches; + const unsigned int _nmulti; + + const bool _trB; + + const Tr _beta; + + /* Blocking info */ + unsigned int _k_block=0; + unsigned int _x_block=0; + unsigned int _Mround=0; + + /* Pretransposed buffer. */ + const Toi *_B_transposed=nullptr; + + unsigned int _B_per_multi = 0; + + /* We will need to walk through the blocks of B in a few contexts, so + * factor that out. */ + class blockwalker { + private: + /* Size loops, etc. based on our parent's configuration */ + const GemmHybrid &_parent; + + /* K, X and multi parameters for current iteration. */ + unsigned int _k0=0, _x0=0; + + unsigned int _index=0; + bool _done=false; + bool _newkblock=true; + + public: + blockwalker(const GemmHybrid &parent) : _parent(parent) { } + + unsigned int xmax() { + return std::min(_x0 + _parent._x_block, _parent._Nsize); + } + + unsigned int kmax() { + return std::min(_k0 + _parent._k_block, _parent._Ksize); + } + + /* Advance to the next block, return false at the end. 
*/ + bool advance(void) { + if (_done) { + return false; + } + + _newkblock=false; + _x0 += _parent._x_block; + if (_x0 >= _parent._Nsize) { + _x0=0; + _k0 += _parent._k_block; + if (_k0 >= _parent._Ksize) { + _done=true; + return false; + } + _newkblock=true; + } + _index++; + + return true; + } + + unsigned int k0(void) { return _k0; } + unsigned int x0(void) { return _x0; } + unsigned int index(void) { return _index; } + bool done(void) { return _done; } + bool newkblock(void) { return _newkblock; } + }; + + +public: + GemmHybrid(GemmHybrid &) = delete; + GemmHybrid & operator= (GemmHybrid &) = delete; + + /* Constructor */ + GemmHybrid(const GemmArgs &args) + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches), + _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta) { + const unsigned int L1_size = _ci->get_L1_cache_size(); + const unsigned int L2_size = _ci->get_L2_cache_size(); + + _B_per_multi = (iceildiv(_Nsize, strategy::out_width()) * strategy::out_width()) * + (iceildiv(_Ksize, strategy::k_unroll()) * strategy::k_unroll()); + + // Work out blocking parameters, or override from config. + + if (args._cfg && args._cfg->inner_block_size) { + _k_block = args._cfg->inner_block_size; + } else { + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + + // Needs to be (at least a single) multiple of the K unroll level. + _k_block /= strategy::k_unroll(); + _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); + + // Now tune to presented problem size; this is how many blocks we need. + int num_k_blocks = iceildiv(_Ksize, _k_block); + + // So divide the space equally into that many blocks. + _k_block = iceildiv(_Ksize, num_k_blocks); + + // And round UP to the K unroll level required. 
+ _k_block = iceildiv(_k_block, strategy::k_unroll()); + _k_block *= strategy::k_unroll(); + } + + if (args._cfg && args._cfg->outer_block_size) { + _x_block = args._cfg->outer_block_size; + } else { + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * _k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + _x_block /= strategy::out_width(); + _x_block = std::max(_x_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. + int num_x_blocks = iceildiv(_Nsize, _x_block); + _x_block = iceildiv(_Nsize, num_x_blocks); + + _x_block = iceildiv(_x_block, strategy::out_width()); + _x_block *= strategy::out_width(); + } + + // Work out the rounded size of M - needed for some buffers. + _Mround = iceildiv(_Msize, strategy::out_height()); + _Mround *= strategy::out_height(); + } + + // Interface implementation - Compulsory functions + + // Window size: Only the last thread should do a ragged block, so dole + // out work in units of out_height. Factor batches and multi into the + // window too. + unsigned int get_window_size() const override { + // _Mround is a multiple of out_height by definition. + return (_Mround / strategy::out_height()) * _nbatches * _nmulti; + } + + // Execute + void execute(unsigned int start, unsigned int end, int threadid) override { +#ifdef CYCLE_PROFILING + profiler prof; +#endif + strategy strat(_ci); + + /* Make sure we've been set up correctly. 
*/ + assert(_B_transposed); + + const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height()); + const unsigned int window_per_multi = window_per_batch * _nbatches; + + const unsigned int first_multi = start / window_per_multi; + const unsigned int last_multi = end / window_per_multi; + + const unsigned int first_batch = (start - (first_multi * window_per_multi)) / window_per_batch; + const unsigned int last_batch = (end - (last_multi * window_per_multi)) / window_per_batch; + + const unsigned int first_row = ((start - (first_multi * window_per_multi)) % window_per_batch) * strategy::out_height(); + const unsigned int last_row = ((end - (last_multi * window_per_multi)) % window_per_batch) * strategy::out_height(); + + static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); + static_assert(std::is_same::value, "gemm_native: Result types must be the same."); + + for (unsigned int multi = first_multi; multi <= last_multi; multi++) { + const unsigned int batch_0 = (multi == first_multi) ? first_batch : 0; + const unsigned int batch_max = (multi == last_multi) ? last_batch : (_nbatches - 1); + + const Toi *b_panel = _B_transposed + (multi * _B_per_multi); + + for (blockwalker current(*this); !current.done(); current.advance()) { + int kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll()); + kern_k *= strat.k_unroll(); + + int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + + for (unsigned int batch = batch_0; batch <= batch_max; batch++) { + const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0; + const unsigned int m_end = ((multi == last_multi) && (batch == last_batch) ) ? 
last_row : _Msize; +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * bblocks * strategy::out_width()); +#endif + + strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + current.k0(), this->_lda, + b_panel, + this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + current.x0(), this->_ldc, + (current.k0() == 0) ? _beta : static_cast(1), + (m_end - m_start), (current.xmax() - current.x0()), kern_k); + } + + b_panel += (bblocks * strat.out_width() * kern_k); + } + } + } + + // Interface implementation - pretransposed + bool B_is_pretransposed() const override { + return true; + } + + bool B_pretranspose_required() const override { + return (_B_transposed==nullptr); + } + + size_t get_B_pretransposed_array_size() const override { + return _B_per_multi * _nmulti * sizeof(Toi); + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + Toi *buffer = reinterpret_cast(in_buffer); + _B_transposed = buffer; + strategy strat(_ci); + + for (unsigned int multi=0; multi < _nmulti; multi++) { + blockwalker current(*this); + + do { + /* Figure out the size of each block. */ + size_t x_size = (current.xmax() - current.x0()); + size_t k_size = (current.kmax() - current.k0()); + + /* Round sizes up as needed. 
*/ + x_size = iceildiv(x_size, strategy::out_width()); + x_size *= strategy::out_width(); + + k_size = iceildiv(k_size, strategy::k_unroll()); + k_size *= strategy::k_unroll(); + + strat.transforms.PrepareB( + buffer, B + (multi * B_multi_stride), ldb, + current.x0(), current.xmax(), current.k0(), current.kmax(), _trB); + + buffer += (x_size * k_size); + } while (current.advance()); + } + } + + void set_pretransposed_B_data(void *in_buffer) override { + _B_transposed = reinterpret_cast(in_buffer); + } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp index 6734e3cce0..bf80784b79 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -22,56 +22,53 @@ * SOFTWARE. */ -#include "gemv_batched.hpp" +#include -namespace arm_gemm { - -template -class GemmImplementation { -public: - /* Is this implementation compatible with the args as provided? */ - virtual bool is_supported(const GemmArgs &args) { return true; } - /* Is this implementation "recommended" for these args (heuristic)? */ - virtual bool is_recommended(const GemmArgs &args) { return true; } - /* Instantiate this method please. */ - virtual UniqueGemmCommon instantiate(const GemmArgs &args) = 0; +#include - /* Indicate the "GemmMethod" for use as a selector */ - const GemmMethod method; - - virtual ~GemmImplementation() { } - - GemmImplementation(GemmMethod method) : method(method) { } -}; +namespace arm_gemm { -/* "gemv_batched" implementation is type-agnostic, so template it here. 
*/ template -class GemmImpl_gemv_batched : public GemmImplementation { -public: - bool is_supported(const GemmArgs &args) override { - return (args._Msize==1 && args._nbatches > 1); - } - - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon (new GemvBatched(args)); - } - - GemmImpl_gemv_batched() : GemmImplementation(GemmMethod::GEMV_BATCHED) { } +struct GemmImplementation { + const GemmMethod method; + const char * name; + std::function &)> is_supported; + std::function &)> is_recommended; + std::function *(const GemmArgs &)> instantiate; }; /* "Master" function implemented for each valid combination of types. * Returns a list of GEMM implementation descriptors for processing by the - * other functions. */ + * other functions, terminated by an implementation with + * method==GemmMethod::DEFAULT. */ template -std::vector *> &gemm_implementation_list(); +const GemmImplementation *gemm_implementation_list(); +/* + * Select a GEMM implementation for the given arguments. + * + * The logic here returns the first method on the list which supports the + * requested problem parameters, matches the provided filters (method and/or + * name string match) and recommends itself. + * + * If there is no such method, it will return the first method which + * supports the requested parameters and passes the filters, regardless of + * recommendation. + * + * If no method supports the requested parameters and passes the filters, + * this function returns false and doesn't touch the provided pointer + * reference. 
+ */ template -GemmImplementation *find_implementation(GemmArgs &args, GemmConfig *cfg) { +bool find_implementation(const GemmArgs &args, const GemmImplementation * &impl) { auto gemms = gemm_implementation_list(); + const GemmConfig *cfg = args._cfg; - for(auto &&i : gemms) { + const GemmImplementation *saved_impl = nullptr; + + for (auto i = gemms; i->method != GemmMethod::DEFAULT; i++) { /* Skip if this implementation doesn't support these args. */ - if (!i->is_supported(args)) { + if (i->is_supported != nullptr && !i->is_supported(args)) { continue; } @@ -80,52 +77,92 @@ GemmImplementation *find_implementation(GemmArgs &args, GemmCon continue; } - /* If no specific method is requested, check that this method recommends itself. */ - if ((!cfg || cfg->method == GemmMethod::DEFAULT) && !i->is_recommended(args)) { + /* Skip if a filter is to be applied and it doesn't match. */ + if (cfg && cfg->filter != "" && !strstr(i->name, cfg->filter.c_str())) { + continue; + } + + /* At this point, if we don't have a saved implementation, save this + * one. This is so that we always return something if a filter + * matches, even if it doesn't recommend itself. + */ + if (saved_impl == nullptr) { + saved_impl=i; + } + + /* Check that this method recommends itself. */ + if (i->is_recommended != nullptr && !i->is_recommended(args)) { + continue; + } + + impl=i; + + return true; + } + + /* We didn't find an option matching the filters that recommended + * itself. But if we found something earlier that matched the filters + * but wasn't recommended, return it here. */ + if (saved_impl != nullptr) { + impl = saved_impl; + return true; + } + + return false; +} + +template +std::vector get_compatible_kernels(const GemmArgs &args) { + std::vector res; + + auto gemms = gemm_implementation_list(); + + for (auto i = gemms; i->method != GemmMethod::DEFAULT; i++) { + /* Check that this implementation supports the presented problem. 
*/ + if (i->is_supported != nullptr && !i->is_supported(args)) { continue; } - return i; + res.push_back(i->name); } - return nullptr; + return res; } template -UniqueGemmCommon gemm(GemmArgs &args, GemmConfig *cfg) { - auto impl = find_implementation(args, cfg); +UniqueGemmCommon gemm(const GemmArgs &args) { + const GemmImplementation *impl; - if (impl) { - return impl->instantiate(args); + if (find_implementation(args, impl)) { + return UniqueGemmCommon(impl->instantiate(args)); } return UniqueGemmCommon(nullptr); } template -GemmMethod get_gemm_method(GemmArgs &args) { - auto impl = find_implementation(args, nullptr); +KernelDescription get_gemm_method(const GemmArgs &args) { + const GemmImplementation *impl; - if (impl) { - return impl->method; + if (find_implementation(args, impl)) { + return KernelDescription(impl->method, impl->name); } /* This shouldn't happen - there should always be at least one valid implementation. */ - return GemmMethod::DEFAULT; + return KernelDescription(); } template -bool method_is_compatible(GemmMethod method, GemmArgs &args) { +bool method_is_compatible(GemmMethod method, const GemmArgs &args) { /* Determine if the method is valid by attempting to obtain an implementation specifying this method. */ - GemmConfig cfg(method); + GemmConfig cfg(method); + GemmArgs myargs = args; - auto impl = find_implementation(args, &cfg); + myargs._cfg = &cfg; - if (impl) { - return true; - } + const GemmImplementation *impl; - return false; + return find_implementation(myargs, impl); } -} // namespace arm_gemm +} // namespace arm_gemm \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp index ad171a7f9a..b4503dd6a2 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,30 +32,33 @@ namespace arm_gemm { -class GemmImpl_gemm_s16_interleaved : public GemmImplementation { -public: - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon(new GemmInterleaved(args)); - } - - GemmImpl_gemm_s16_interleaved() : GemmImplementation(GemmMethod::GEMM_INTERLEAVED) { } -}; - -static GemmImpl_gemm_s16_interleaved gemm_s16_interleaved_impl{}; - -static std::vector *> gemm_s16_methods = { - &gemm_s16_interleaved_impl +static const GemmImplementation gemm_s16_methods[] = { +{ + GemmMethod::GEMM_INTERLEAVED, + "gemm_s16_12x8", + nullptr, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +{ + GemmMethod::DEFAULT, + "", + nullptr, + nullptr, + nullptr +} }; template<> -std::vector *> &gemm_implementation_list() { +const GemmImplementation *gemm_implementation_list() { return gemm_s16_methods; } /* Explicitly instantiate the external functions for these types. */ -template UniqueGemmCommon gemm(GemmArgs &args, GemmConfig *cfg); -template GemmMethod get_gemm_method(GemmArgs &args); -template bool method_is_compatible(GemmMethod method, GemmArgs &args); +template UniqueGemmCommon gemm(const GemmArgs &args); +template KernelDescription get_gemm_method(const GemmArgs &args); +template bool method_is_compatible(GemmMethod method, const GemmArgs &args); +template std::vector get_compatible_kernels (const GemmArgs &args); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index 627d8abdb9..34dc8bc341 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,67 +27,67 @@ #include "gemm_common.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" +#include "gemm_native.hpp" #include "kernels/a64_gemm_s16_12x8.hpp" #include "kernels/a64_gemm_s8_12x8.hpp" #include "kernels/a64_gemm_s8_4x4.hpp" #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp" +#include "kernels/sve_native_s8s32_dot_4VLx4.hpp" namespace arm_gemm { +static const GemmImplementation gemm_s8_methods[] = { #ifdef __ARM_FEATURE_SVE -class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation { -public: - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon(new GemmInterleaved(args)); - } - - GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation(GemmMethod::GEMM_INTERLEAVED_DOT) { } -}; -#else - -class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation { -public: - bool is_supported(const GemmArgs &args) override { - return args._ci->has_dotprod(); - } - - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon(new GemmInterleaved(args)); - } - - GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation(GemmMethod::GEMM_INTERLEAVED_DOT) { } -}; - +{ + GemmMethod::GEMM_NATIVE, + "native_s8s32_dot_4VLx4", + [](const GemmArgs &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); }, + [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); }, + [](const GemmArgs &args) { return new GemmNative(args); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "interleaved_s8s32_dot_3VLx8", + [](const GemmArgs &args) { return (args._Ksize>4); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, #endif - -class GemmImpl_gemm_s8_interleaved : public GemmImplementation { -public: - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon(new GemmInterleaved(args)); - } - - GemmImpl_gemm_s8_interleaved() : 
GemmImplementation(GemmMethod::GEMM_INTERLEAVED) { } -}; - -static GemmImpl_gemm_s8_interleaved_dot gemm_s8_interleaved_dot_impl{}; -static GemmImpl_gemm_s8_interleaved gemm_s8_interleaved_impl{}; - -static std::vector *> gemm_s8_methods = { - &gemm_s8_interleaved_dot_impl, - &gemm_s8_interleaved_impl +{ + GemmMethod::GEMM_INTERLEAVED, + "gemm_s8_12x8", + [](const GemmArgs &args) { return args._ci->has_dotprod(); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "gemm_s8_4x4", + nullptr, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +{ + GemmMethod::DEFAULT, + "", + nullptr, + nullptr, + nullptr +} }; template<> -std::vector *> &gemm_implementation_list() { +const GemmImplementation *gemm_implementation_list() { return gemm_s8_methods; } /* Explicitly instantiate the external functions for these types. */ -template UniqueGemmCommon gemm(GemmArgs &args, GemmConfig *cfg); -template GemmMethod get_gemm_method(GemmArgs &args); -template bool method_is_compatible(GemmMethod method, GemmArgs &args); +template UniqueGemmCommon gemm(const GemmArgs &args); +template KernelDescription get_gemm_method(const GemmArgs &args); +template bool method_is_compatible(GemmMethod method, const GemmArgs &args); +template std::vector get_compatible_kernels (const GemmArgs &args); } // namespace arm_gemm -#endif // __aarch64__ +#endif // __aarch64__ \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index 0e58a4d01f..436438f351 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -318,50 +318,57 @@ public: /* Constructor */ GemmInterleaved(const GemmArgs &args) - : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), - _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB), - _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads), - _pretransposed(args._pretransposed_hint) { + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB), + _alpha(args._alpha), _beta(args._beta), _maxthreads(args._maxthreads), _nthreads(args._maxthreads), + _pretransposed(args._pretransposed_hint) { const unsigned int L1_size = _ci->get_L1_cache_size(); const unsigned int L2_size = _ci->get_L2_cache_size(); assert(_maxthreads > 0); - // Work out blocking parameters - - // k_block: Find out how much of the larger array can be loaded into half the cache. - // This should account for associative caches. - _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + // Work out blocking parameters, or override from provided GemmConfig + if (args._cfg && args._cfg->inner_block_size) { + _k_block = args._cfg->inner_block_size; + } else { + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); - // Needs to be (at least a single) multiple of the K unroll level. - _k_block /= strategy::k_unroll(); - _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); + // Needs to be (at least a single) multiple of the K unroll level. + _k_block /= strategy::k_unroll(); + _k_block = std::max(_k_block, 1U) * strategy::k_unroll(); - // Now tune to presented problem size; this is how many blocks we need. 
- int num_k_blocks = iceildiv(_Ksize, _k_block); + // Now tune to presented problem size; this is how many blocks we need. + int num_k_blocks = iceildiv(_Ksize, _k_block); - // So divide the space equally into that many blocks. - _k_block = iceildiv(_Ksize, num_k_blocks); + // So divide the space equally into that many blocks. + _k_block = iceildiv(_Ksize, num_k_blocks); - // And round UP to the K unroll level required. - _k_block = iceildiv(_k_block, strategy::k_unroll()); - _k_block *= strategy::k_unroll(); + // And round UP to the K unroll level required. + _k_block = iceildiv(_k_block, strategy::k_unroll()); + _k_block *= strategy::k_unroll(); + } - // x_block: Work out how many rows (of length k_block) will fit in the L2 - // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / - (sizeof(Toi) * _k_block); + if (args._cfg && args._cfg->outer_block_size) { + _x_block = args._cfg->outer_block_size; + } else { + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * _k_block); - // Needs to be (at least a single) multiple of the kernel output width. - _x_block /= strategy::out_width(); - _x_block = std::max(_x_block, 1U) * strategy::out_width(); + // Needs to be (at least a single) multiple of the kernel output width. + _x_block /= strategy::out_width(); + _x_block = std::max(_x_block, 1U) * strategy::out_width(); - // And tune to the presented problem size. - int num_x_blocks = iceildiv(_Nsize, _x_block); - _x_block = iceildiv(_Nsize, num_x_blocks); + // And tune to the presented problem size. 
+ int num_x_blocks = iceildiv(_Nsize, _x_block); + _x_block = iceildiv(_Nsize, num_x_blocks); - _x_block = iceildiv(_x_block, strategy::out_width()); - _x_block *= strategy::out_width(); + _x_block = iceildiv(_x_block, strategy::out_width()); + _x_block *= strategy::out_width(); + } // Work out the rounded size of M - needed for some buffers. _Mround = iceildiv(_Msize, strategy::out_height()); diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp index baa1316745..579533418d 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -74,11 +74,11 @@ public: GemmNative(GemmNative &) = delete; GemmNative & operator= (GemmNative &) = delete; - GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta) : - _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci) { + GemmNative(const GemmArgs &args) + : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) { /* For now don't do any blocking. TODO: figure out if we should. */ - k_block = K; - n_block = N; + k_block = _Ksize; + n_block = _Nsize; } // Window is amount per multi multiplied by total number of multis. @@ -105,8 +105,13 @@ public: unsigned int y0 = batch_pos * strategy::out_height(); - for (unsigned int pos=start; pos0; ) { + // Do work from here to the end of the current batch/multi + const unsigned int ymax = std::min(y0 + (l * strategy::out_height()), _Msize); + + // Work out how many units this is and subtract from loop counter. 
+ l -= ((ymax - y0) + (strategy::out_height() - 1)) / strategy::out_height(); + #ifdef CYCLE_PROFILING auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * _Nsize * _Ksize); #endif @@ -117,7 +122,7 @@ public: _beta, (ymax-y0), _Nsize, _Ksize); /* Advance to next item */ - y0 += strategy::out_height(); + y0 = ymax; /* Check for batch/multi overflow */ if (y0 >= _Msize) { diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp index feea4829d1..6bcbca9e8b 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -32,31 +32,34 @@ namespace arm_gemm { -class GemmImpl_gemm_u16_interleaved : public GemmImplementation { -public: - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon(new GemmInterleaved(args)); - } - - GemmImpl_gemm_u16_interleaved() : GemmImplementation(GemmMethod::GEMM_INTERLEAVED) { } -}; - -static GemmImpl_gemm_u16_interleaved gemm_u16_interleaved_impl{}; - -static std::vector *> gemm_u16_methods = { - &gemm_u16_interleaved_impl +static const GemmImplementation gemm_u16_methods[] = { +{ + GemmMethod::GEMM_INTERLEAVED, + "gemm_u16_12x8", + nullptr, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +{ + GemmMethod::DEFAULT, + "", + nullptr, + nullptr, + nullptr +} }; template<> -std::vector *> &gemm_implementation_list() { +const GemmImplementation *gemm_implementation_list() { return gemm_u16_methods; } /* Explicitly instantiate the external functions for these types. 
*/ -template UniqueGemmCommon gemm(GemmArgs &args, GemmConfig *cfg); -template GemmMethod get_gemm_method(GemmArgs &args); -template bool method_is_compatible(GemmMethod method, GemmArgs &args); +template UniqueGemmCommon gemm(const GemmArgs &args); +template KernelDescription get_gemm_method(const GemmArgs &args); +template bool method_is_compatible(GemmMethod method, const GemmArgs &args); +template std::vector get_compatible_kernels (const GemmArgs &args); } // namespace arm_gemm -#endif // __aarch64__ +#endif // __aarch64__ \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index b7c1bab6bd..3c8df3f044 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -27,64 +27,66 @@ #include "gemm_common.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" +#include "gemm_native.hpp" #include "kernels/a64_gemm_u16_12x8.hpp" #include "kernels/a64_gemm_u8_12x8.hpp" #include "kernels/a64_gemm_u8_4x4.hpp" #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp" +#include "kernels/sve_native_u8u32_dot_4VLx4.hpp" namespace arm_gemm { +static const GemmImplementation gemm_u8_methods[] = { #ifdef __ARM_FEATURE_SVE -class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation { -public: - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon(new GemmInterleaved(args)); - } - - GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation(GemmMethod::GEMM_INTERLEAVED_DOT) { } -}; -#else -class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation { -public: - bool is_supported(const GemmArgs &args) override { - return args._ci->has_dotprod(); - } - - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon(new 
GemmInterleaved(args)); - } - - GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation(GemmMethod::GEMM_INTERLEAVED_DOT) { } -}; +{ + GemmMethod::GEMM_NATIVE, + "native_u8u32_dot_4VLx4", + [](const GemmArgs &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); }, + [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); }, + [](const GemmArgs &args) { return new GemmNative(args); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "interleaved_u8u32_dot_3VLx8", + [](const GemmArgs &args) { return (args._Ksize>4); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, #endif - -class GemmImpl_gemm_u8_interleaved : public GemmImplementation { -public: - UniqueGemmCommon instantiate(const GemmArgs &args) override { - return UniqueGemmCommon(new GemmInterleaved(args)); - } - - GemmImpl_gemm_u8_interleaved() : GemmImplementation(GemmMethod::GEMM_INTERLEAVED) { } -}; - -static GemmImpl_gemm_u8_interleaved_dot gemm_u8_interleaved_dot_impl{}; -static GemmImpl_gemm_u8_interleaved gemm_u8_interleaved_impl{}; - -static std::vector *> gemm_u8_methods = { - &gemm_u8_interleaved_dot_impl, - &gemm_u8_interleaved_impl +{ + GemmMethod::GEMM_INTERLEAVED, + "gemm_u8_12x8", + [](const GemmArgs &args) { return args._ci->has_dotprod(); }, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "gemm_u8_4x4", + nullptr, + nullptr, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +}, +{ + GemmMethod::DEFAULT, + "", + nullptr, + nullptr, + nullptr +} }; template<> -std::vector *> &gemm_implementation_list() { +const GemmImplementation *gemm_implementation_list() { return gemm_u8_methods; } /* Explicitly instantiate the external functions for these types. 
*/ -template UniqueGemmCommon gemm(GemmArgs &args, GemmConfig *cfg); -template GemmMethod get_gemm_method(GemmArgs &args); -template bool method_is_compatible(GemmMethod method, GemmArgs &args); +template UniqueGemmCommon gemm(const GemmArgs &args); +template KernelDescription get_gemm_method(const GemmArgs &args); +template bool method_is_compatible(GemmMethod method, const GemmArgs &args); +template std::vector get_compatible_kernels (const GemmArgs &args); } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp index d65971e47d..40f7f2b7cd 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,7 +41,7 @@ public: GemmArgs newargs = args; newargs._Msize = args._nbatches; newargs._nbatches = 1; - _subgemm = gemm(newargs, nullptr); + _subgemm = gemm(newargs); } void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp index 241c5fea27..5cf42761e6 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -64,10 +64,11 @@ public: GemvNativeTransposed(GemvNativeTransposed &) = delete; GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete; - GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) { + GemvNativeTransposed(const GemmArgs &args) + : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) { /* For now don't do any blocking. TODO: figure out if we should. */ - m_block = K; - n_block = N; + m_block = _Ksize; + n_block = _Nsize; } // Window is number of out_width blocks times number of multis. diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index e53ddb26c1..842339ef23 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -68,12 +68,21 @@ public: GemvPretransposed(GemvPretransposed &) = delete; GemvPretransposed & operator= (GemvPretransposed &) = delete; - GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta) : - _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci), - _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) { + GemvPretransposed(const GemmArgs &args) + : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _trB(args._trB), _beta(args._beta), _ci(args._ci), + _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) { /* For now don't do any blocking. TODO: figure out if we should. 
*/ - m_block = K; - n_block = N; + if (args._cfg && args._cfg->inner_block_size) { + m_block = args._cfg->inner_block_size; + } else { + m_block = _Ksize; + } + + if (args._cfg && args._cfg->outer_block_size) { + n_block = args._cfg->outer_block_size; + } else { + n_block = _Nsize; + } } // Window is number of out_width blocks, times number of multis. diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp index 418a375a61..4ad38cbf62 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -32,9 +32,9 @@ // Kernel implementation. // // Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. // Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. +// 24x8), the chunks being arranged in a row major fashion. // // Note that the intent of this is that either ablocks or bblocks will be 1 // - this construction allows the output loop to proceed in either order. diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp new file mode 100644 index 0000000000..0c387ff6df --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm { + +// Actual kernel implementations +void a64_sgemm_nativeA_pretransposeB_16x4(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int); + +// Native A/Pretranspose B SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPUInfo +// structure. 
+class sgemm_nativeA_pretransposeB_16x4 { +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int); + + /* Desired data layout for B buffer (used for pretranspose) */ + static const int B_interleave = 16; + static const int B_block = 1; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static int out_width() { + return 16; + } + + static int out_height() { + return 4; + } + + static int k_unroll() { + return 1; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_sgemm_nativeA_pretransposeB_16x4; + + sgemm_nativeA_pretransposeB_16x4(const CPUInfo *ci) { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp new file mode 100644 index 0000000000..b2516f8797 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp @@ -0,0 +1,970 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +#include +#include +#include + +#include + +namespace arm_gemm { + +void a64_sgemm_nativeA_pretransposeB_16x4(const float *A, int lda, const float *B_panel, float *C, int ldc, float beta, unsigned int numrows, unsigned int numcols, unsigned int K) { + const bool oddk = ((K % 8) >= 4); + const bool beta0 = (beta == 0.0f); + const unsigned int oddones = (K % 4); + + /* Use some small temporary arrays to cope with "ragged" M/N sizes. + * + * "dummy_A_buf" is used to avoid overreading the A input for ragged M, + * and also for output if N is not ragged. + * + * Since the B input is pretransposed it will be padded as needed, so no + * need to worry about overreading that. + * + * "C_buf" is used to avoid overreading or overwriting the output for + * ragged N cases. + */ + float dummy_A_buf[16]; + float C_buf[64]; + + std::memset(dummy_A_buf, 0, sizeof(dummy_A_buf)); + std::memset(C_buf, 0, sizeof(C_buf)); + + for (unsigned int y=0; y 1) ? 32 : 0; + const unsigned long a_incr2 = (active_rows > 2) ? 32 : 0; + const unsigned long a_incr3 = (active_rows > 3) ? 32 : 0; + + /* Starting points for A pointers on this loop */ + const float * const a_ptr0_base = A + (y * lda); + const float * const a_ptr1_base = (active_rows > 1) ? (a_ptr0_base + lda) : dummy_A_buf; + const float * const a_ptr2_base = (active_rows > 2) ? (a_ptr1_base + lda) : dummy_A_buf; + const float * const a_ptr3_base = (active_rows > 3) ? 
(a_ptr2_base + lda) : dummy_A_buf; + + /* Starting points for C pointers on this loop */ + float *c_ptr0 = C + (y * ldc); + float *c_ptr1 = (active_rows > 1) ? (c_ptr0 + ldc) : dummy_A_buf; + float *c_ptr2 = (active_rows > 2) ? (c_ptr1 + ldc) : dummy_A_buf; + float *c_ptr3 = (active_rows > 3) ? (c_ptr2 + ldc) : dummy_A_buf; + + for (unsigned int x0=0; x0() * 4; + } + + static int k_unroll() + { + return 1; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_fp32_mla_4VLx4; + + hybrid_fp32_mla_4VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp new file mode 100644 index 0000000000..b8aa8252d1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp @@ -0,0 +1,2005 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) { + const long beta0 = (beta == 0.0f); + const int K_stride = K; + const long loops_count = ((K + 4) / 8) - 1; + K -= loops_count * 8; + const long regs_count = (K / 4) - 1; + K -= (regs_count + 1) * 4; + const long leftovers = K; + + for (int y=0; y())) { + const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); + const float *betaptr = β + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = leftovers; + const float *a_ptr0 = a_ptr0_base; + const float *b_ptr0 = B + (K_stride * x0); + + switch(M-y) { + case 1: + __asm __volatile ( + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "mov z18.s, #0\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z19.s, #0\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, 
[%[c_ptr0], #3, MUL VL]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "2:\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" 
+ "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, 
z0.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + 
"ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs 
%[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], 
%[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "mov z19.s, #0\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z20.s, #0\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z21.s, #0\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "mov z23.s, #0\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0], 
#4, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "2:\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, 
z13.s, z1.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" 
+ "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + 
"fmla z22.s, z14.s, z1.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla 
z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, 
z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla 
z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + 
"ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "mov z20.s, #0\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z21.s, #0\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z23.s, #0\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "mov z24.s, #0\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "mov z25.s, #0\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "mov z26.s, #0\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z27.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z24.s, p7/m, z24.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z25.s, p7/m, z25.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmul z27.s, p7/m, z27.s, z15.s\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr1, a_ptr1, 
#0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "2:\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "ld1w 
z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, 
z14.s, z6.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + 
"fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1w z15.s, 
p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, 
z15.s, z6.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, 
p6/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "fmla z27.s, z15.s, z2.s[3]\n" 
+ "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" 
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1rqw z3.s, 
p7/z, [a_ptr3]\n" + "mov z20.s, #0\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "mov z21.s, #0\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z23.s, #0\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z24.s, #0\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "mov z25.s, #0\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "mov z26.s, #0\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "mov z27.s, #0\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z28.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z29.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "mov z30.s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "mov z31.s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z28.s, p0/z, [c_ptr3]\n" + "fmul z24.s, p7/m, z24.s, z15.s\n" + "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "fmul z25.s, p7/m, z25.s, z15.s\n" + "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" + "fmul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" + "fmul z27.s, p7/m, z27.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z28.s, 
p7/m, z28.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z29.s, p7/m, z29.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmul z30.s, p7/m, z30.s, z15.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmul z31.s, p7/m, z31.s, z15.s\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "2:\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL 
VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + 
"fmla z27.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL 
VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z28.s, z12.s, z7.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z29.s, z13.s, z7.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z30.s, z14.s, z7.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, 
z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + 
"fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla 
z29.s, z9.s, z7.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z28.s, z12.s, z7.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z29.s, z13.s, z7.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z30.s, z14.s, z7.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, 
z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, 
z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w 
z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, 
z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + "st1w z28.s, p0, [c_ptr3]\n" + "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" + "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" + "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp index 3fd738e673..9d88b60cee 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. 
+ * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ public: /* Kernel blocking parameters */ static int out_width() { - return svcnth() * 3; + return get_vector_length<__fp16>() * 3; } static int out_height() diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp index 92ec888244..517895ca7f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,24 +48,24 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "mov z8.h, #0\n" "ptrue p0.h\n" "mov z9.h, #0\n" - "ld1rqh z0.h, p0/z, [%[a_ptr]]\n" "mov z10.h, #0\n" - "ld1h z2.h, p0/z, [%[b_ptr]]\n" "mov z11.h, #0\n" - "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n" "mov z12.h, #0\n" - "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n" + "ld1rqh z0.h, p0/z, [%[a_ptr]]\n" "mov z13.h, #0\n" - "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n" + "ld1h z2.h, p0/z, [%[b_ptr]]\n" "mov z14.h, #0\n" - "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n" + "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n" "mov z15.h, #0\n" - "add %[a_ptr], %[a_ptr], #0x20\n" + "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n" "mov z16.h, #0\n" - "addvl %[b_ptr], %[b_ptr], #6\n" + "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n" "mov z17.h, #0\n" + "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n" "mov z18.h, #0\n" + "add %[a_ptr], %[a_ptr], #0x20\n" "mov z19.h, #0\n" + "addvl %[b_ptr], %[b_ptr], #6\n" "mov z20.h, #0\n" "mov z21.h, #0\n" "mov z22.h, #0\n" @@ -199,37 +199,31 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z30.h, z7.h, z1.h[6]\n" "fmla z31.h, z7.h, z1.h[7]\n" "fmla z8.h, z2.h, z0.h[0]\n" - 
"st1h z8.h, p0, [%[c_ptr]]\n" "fmla z9.h, z2.h, z0.h[1]\n" "fmla z10.h, z2.h, z0.h[2]\n" "fmla z11.h, z2.h, z0.h[3]\n" "fmla z12.h, z2.h, z0.h[4]\n" + "st1h z8.h, p0, [%[c_ptr]]\n" "fmla z13.h, z2.h, z0.h[5]\n" "fmla z14.h, z2.h, z0.h[6]\n" "fmla z15.h, z2.h, z0.h[7]\n" "fmla z16.h, z3.h, z0.h[0]\n" - "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z17.h, z3.h, z0.h[1]\n" "fmla z18.h, z3.h, z0.h[2]\n" "fmla z19.h, z3.h, z0.h[3]\n" "fmla z20.h, z3.h, z0.h[4]\n" + "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z21.h, z3.h, z0.h[5]\n" "fmla z22.h, z3.h, z0.h[6]\n" "fmla z23.h, z3.h, z0.h[7]\n" "fmla z24.h, z4.h, z0.h[0]\n" - "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z25.h, z4.h, z0.h[1]\n" - "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z26.h, z4.h, z0.h[2]\n" - "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n" "fmla z27.h, z4.h, z0.h[3]\n" - "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n" "fmla z28.h, z4.h, z0.h[4]\n" - "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n" + "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z29.h, z4.h, z0.h[5]\n" - "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n" "fmla z30.h, z4.h, z0.h[6]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "fmla z31.h, z4.h, z0.h[7]\n" "b 4f\n" "3:\n" @@ -260,39 +254,39 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z30.h, z4.h, z0.h[6]\n" "fmla z31.h, z4.h, z0.h[7]\n" "fmla z8.h, z5.h, z1.h[0]\n" - "st1h z8.h, p0, [%[c_ptr]]\n" "fmla z9.h, z5.h, z1.h[1]\n" "fmla z10.h, z5.h, z1.h[2]\n" "fmla z11.h, z5.h, z1.h[3]\n" "fmla z12.h, z5.h, z1.h[4]\n" + "st1h z8.h, p0, [%[c_ptr]]\n" "fmla z13.h, z5.h, z1.h[5]\n" "fmla z14.h, z5.h, z1.h[6]\n" "fmla z15.h, z5.h, z1.h[7]\n" "fmla z16.h, z6.h, z1.h[0]\n" - "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z17.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z1.h[2]\n" "fmla z19.h, z6.h, z1.h[3]\n" "fmla z20.h, z6.h, z1.h[4]\n" + "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z21.h, z6.h, z1.h[5]\n" "fmla z22.h, z6.h, z1.h[6]\n" "fmla z23.h, z6.h, 
z1.h[7]\n" "fmla z24.h, z7.h, z1.h[0]\n" - "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z25.h, z7.h, z1.h[1]\n" - "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z26.h, z7.h, z1.h[2]\n" - "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n" "fmla z27.h, z7.h, z1.h[3]\n" - "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n" "fmla z28.h, z7.h, z1.h[4]\n" - "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n" + "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z29.h, z7.h, z1.h[5]\n" - "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n" "fmla z30.h, z7.h, z1.h[6]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "fmla z31.h, z7.h, z1.h[7]\n" "4:\n" + "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" + "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n" + "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n" + "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n" + "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n" + "addvl %[c_ptr], %[c_ptr], #16\n" "st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n" "st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n" "st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp index b2327f3070..2e8f261fe1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ public: /* Kernel blocking parameters */ static int out_width() { - return svcntw() * 3; + return get_vector_length() * 3; } static int out_height() diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp index bb08fc7cb0..88c984018e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,22 +48,22 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "mov z8.s, #0\n" "ptrue p0.s\n" "mov z9.s, #0\n" - "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" "mov z10.s, #0\n" - "ld1w z4.s, p0/z, [%[b_ptr]]\n" "mov z11.s, #0\n" - "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" "mov z12.s, #0\n" - "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" + "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" "mov z13.s, #0\n" - "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" + "ld1w z4.s, p0/z, [%[b_ptr]]\n" "mov z14.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" "mov z15.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #3\n" + "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" "mov z16.s, #0\n" + "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" "mov z17.s, #0\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z18.s, #0\n" + "addvl %[b_ptr], %[b_ptr], #3\n" "mov z19.s, #0\n" "mov z20.s, #0\n" "mov z21.s, #0\n" @@ -204,37 +204,31 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z31.s, z6.s, z3.s[3]\n" "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" "fmla z8.s, z4.s, z0.s[0]\n" - "st1w z8.s, p0, [%[c_ptr]]\n" "fmla z9.s, z4.s, z0.s[1]\n" "fmla z10.s, z4.s, z0.s[2]\n" "fmla z11.s, z4.s, z0.s[3]\n" "fmla z20.s, z4.s, z1.s[0]\n" + 
"st1w z8.s, p0, [%[c_ptr]]\n" "fmla z21.s, z4.s, z1.s[1]\n" "fmla z22.s, z4.s, z1.s[2]\n" "fmla z23.s, z4.s, z1.s[3]\n" "fmla z12.s, z5.s, z0.s[0]\n" - "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z13.s, z5.s, z0.s[1]\n" "fmla z14.s, z5.s, z0.s[2]\n" "fmla z15.s, z5.s, z0.s[3]\n" "fmla z24.s, z5.s, z1.s[0]\n" + "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z25.s, z5.s, z1.s[1]\n" "fmla z26.s, z5.s, z1.s[2]\n" "fmla z27.s, z5.s, z1.s[3]\n" "fmla z16.s, z6.s, z0.s[0]\n" - "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z17.s, z6.s, z0.s[1]\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z18.s, z6.s, z0.s[2]\n" - "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "fmla z19.s, z6.s, z0.s[3]\n" - "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "fmla z28.s, z6.s, z1.s[0]\n" - "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z29.s, z6.s, z1.s[1]\n" - "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" "fmla z30.s, z6.s, z1.s[2]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "fmla z31.s, z6.s, z1.s[3]\n" "b 4f\n" "3:\n" @@ -269,39 +263,39 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z31.s, z6.s, z1.s[3]\n" "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" "fmla z8.s, z4.s, z2.s[0]\n" - "st1w z8.s, p0, [%[c_ptr]]\n" "fmla z9.s, z4.s, z2.s[1]\n" "fmla z10.s, z4.s, z2.s[2]\n" "fmla z11.s, z4.s, z2.s[3]\n" "fmla z20.s, z4.s, z3.s[0]\n" + "st1w z8.s, p0, [%[c_ptr]]\n" "fmla z21.s, z4.s, z3.s[1]\n" "fmla z22.s, z4.s, z3.s[2]\n" "fmla z23.s, z4.s, z3.s[3]\n" "fmla z12.s, z5.s, z2.s[0]\n" - "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z13.s, z5.s, z2.s[1]\n" "fmla z14.s, z5.s, z2.s[2]\n" "fmla z15.s, z5.s, z2.s[3]\n" "fmla z24.s, z5.s, z3.s[0]\n" + "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z25.s, z5.s, z3.s[1]\n" "fmla z26.s, z5.s, z3.s[2]\n" "fmla z27.s, z5.s, z3.s[3]\n" "fmla z16.s, z6.s, z2.s[0]\n" - "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z17.s, z6.s, z2.s[1]\n" - "st1w z9.s, p0, [%[c_ptr], 
#3, MUL VL]\n" "fmla z18.s, z6.s, z2.s[2]\n" - "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "fmla z19.s, z6.s, z2.s[3]\n" - "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "fmla z28.s, z6.s, z3.s[0]\n" - "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z29.s, z6.s, z3.s[1]\n" - "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" "fmla z30.s, z6.s, z3.s[2]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "fmla z31.s, z6.s, z3.s[3]\n" "4:\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" + "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" + "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" + "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" + "addvl %[c_ptr], %[c_ptr], #16\n" "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n" "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n" "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp index 91aa567d4a..67154e6a3f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ public: /* Kernel blocking parameters */ static int out_width() { - return svcntw() * 3; + return get_vector_length() * 3; } static int out_height() diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp index 2e994a13f3..d679c211ef 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. 
+ * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,22 +49,22 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "mov z8.s, #0\n" "ptrue p0.b\n" "mov z9.s, #0\n" - "ld1rqb z0.b, p0/z, [%[a_ptr]]\n" "mov z10.s, #0\n" - "ld1b z4.b, p0/z, [%[b_ptr]]\n" "mov z11.s, #0\n" - "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n" "mov z12.s, #0\n" - "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n" + "ld1rqb z0.b, p0/z, [%[a_ptr]]\n" "mov z13.s, #0\n" - "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n" + "ld1b z4.b, p0/z, [%[b_ptr]]\n" "mov z14.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n" "mov z15.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #3\n" + "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n" "mov z16.s, #0\n" + "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n" "mov z17.s, #0\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z18.s, #0\n" + "addvl %[b_ptr], %[b_ptr], #3\n" "mov z19.s, #0\n" "mov z20.s, #0\n" "mov z21.s, #0\n" @@ -205,37 +205,31 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z31.s, z6.b, z3.b[3]\n" "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" "sdot z8.s, z4.b, z0.b[0]\n" - "st1w z8.s, p0, [%[c_ptr]]\n" "sdot z9.s, z4.b, z0.b[1]\n" "sdot z10.s, z4.b, z0.b[2]\n" "sdot z11.s, z4.b, z0.b[3]\n" "sdot z20.s, z4.b, z1.b[0]\n" + "st1w z8.s, p0, [%[c_ptr]]\n" "sdot z21.s, z4.b, z1.b[1]\n" "sdot z22.s, z4.b, z1.b[2]\n" "sdot z23.s, z4.b, z1.b[3]\n" "sdot z12.s, z5.b, z0.b[0]\n" - "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "sdot z13.s, z5.b, z0.b[1]\n" "sdot z14.s, z5.b, z0.b[2]\n" "sdot z15.s, z5.b, z0.b[3]\n" "sdot z24.s, z5.b, z1.b[0]\n" + "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "sdot z25.s, z5.b, z1.b[1]\n" "sdot z26.s, z5.b, z1.b[2]\n" "sdot z27.s, z5.b, z1.b[3]\n" "sdot z16.s, z6.b, z0.b[0]\n" - "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "sdot z17.s, z6.b, z0.b[1]\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "sdot z18.s, z6.b, z0.b[2]\n" - "st1w z13.s, p0, 
[%[c_ptr], #4, MUL VL]\n" "sdot z19.s, z6.b, z0.b[3]\n" - "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "sdot z28.s, z6.b, z1.b[0]\n" - "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "sdot z29.s, z6.b, z1.b[1]\n" - "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" "sdot z30.s, z6.b, z1.b[2]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "sdot z31.s, z6.b, z1.b[3]\n" "b 4f\n" "3:\n" @@ -270,39 +264,39 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z31.s, z6.b, z1.b[3]\n" "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" "sdot z8.s, z4.b, z2.b[0]\n" - "st1w z8.s, p0, [%[c_ptr]]\n" "sdot z9.s, z4.b, z2.b[1]\n" "sdot z10.s, z4.b, z2.b[2]\n" "sdot z11.s, z4.b, z2.b[3]\n" "sdot z20.s, z4.b, z3.b[0]\n" + "st1w z8.s, p0, [%[c_ptr]]\n" "sdot z21.s, z4.b, z3.b[1]\n" "sdot z22.s, z4.b, z3.b[2]\n" "sdot z23.s, z4.b, z3.b[3]\n" "sdot z12.s, z5.b, z2.b[0]\n" - "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "sdot z13.s, z5.b, z2.b[1]\n" "sdot z14.s, z5.b, z2.b[2]\n" "sdot z15.s, z5.b, z2.b[3]\n" "sdot z24.s, z5.b, z3.b[0]\n" + "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "sdot z25.s, z5.b, z3.b[1]\n" "sdot z26.s, z5.b, z3.b[2]\n" "sdot z27.s, z5.b, z3.b[3]\n" "sdot z16.s, z6.b, z2.b[0]\n" - "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "sdot z17.s, z6.b, z2.b[1]\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "sdot z18.s, z6.b, z2.b[2]\n" - "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "sdot z19.s, z6.b, z2.b[3]\n" - "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "sdot z28.s, z6.b, z3.b[0]\n" - "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "sdot z29.s, z6.b, z3.b[1]\n" - "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" "sdot z30.s, z6.b, z3.b[2]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "sdot z31.s, z6.b, z3.b[3]\n" "4:\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" + "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" + "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" + "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + 
"st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" + "addvl %[c_ptr], %[c_ptr], #16\n" "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n" "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n" "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp index ef457e454f..628c5a868e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ public: /* Kernel blocking parameters */ static int out_width() { - return svcntw() * 3; + return get_vector_length() * 3; } static int out_height() diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp new file mode 100644 index 0000000000..fcc80d9fe5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_native_fp32_mla_4VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int); + +class native_fp32_mla_4VLx4 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int); + + /* Kernel blocking parameters */ + static int out_height() + { + return 4; + } + + static int out_width() + { + return get_vector_length() * 4; + } + + static int k_unroll() + { + return 1; + } + + + + // Default to the generic kernel + kern_type kernel=sve_native_fp32_mla_4VLx4; + + native_fp32_mla_4VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp new file mode 100644 index 0000000000..6e225669fc --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp @@ -0,0 +1,2066 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) { + const long beta0 = (beta == 0.0f); + const long loops_count = ((K + 4) / 8) - 1; + K -= loops_count * 8; + const long regs_count = (K / 4) - 1; + K -= (regs_count + 1) * 4; + const long leftovers = K; + + for (int y=0; y())) { + const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); + const float *betaptr = β + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = leftovers; + const float *a_ptr0 = a_ptr0_base; + const float *b_ptr0 = B + x0; + long ldbb = ldb * sizeof(float); + + switch(M-y) { + case 1: + __asm __volatile ( + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "mov z18.s, #0\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z19.s, #0\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmul z18.s, 
p7/m, z18.s, z15.s\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL 
VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, 
z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + 
"ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, 
z0.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", 
"z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "mov z19.s, #0\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z20.s, #0\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z21.s, #0\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z22.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z23.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "2:\n" + "add a_ptr1, 
a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "cbz %[loops], 3f\n" + "4:\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla 
z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + 
"fmla z21.s, z13.s, z5.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "b.ne 4b\n" + "3:\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w 
z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "cbz 
%[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + 
"ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + 
"fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], 
%[width]\n" + "incw %[temp], all, mul #1\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "mov z20.s, #0\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z21.s, #0\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z23.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z24.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "mov z25.s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "mov z26.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z27.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z24.s, p7/m, z24.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z25.s, p7/m, z25.s, 
z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmul z27.s, p7/m, z27.s, z15.s\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "2:\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, 
z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, 
z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL 
VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, 
z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, 
z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, 
z2.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p3/z, 
[%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, 
z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + 
__asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z20.s, #0\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "mov z21.s, #0\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z23.s, #0\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z24.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z25.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "mov z26.s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "mov z27.s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z28.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL 
VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z28.s, p0/z, [c_ptr3]\n" + "fmul z24.s, p7/m, z24.s, z15.s\n" + "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "fmul z25.s, p7/m, z25.s, z15.s\n" + "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" + "fmul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" + "fmul z27.s, p7/m, z27.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z28.s, p7/m, z28.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z29.s, p7/m, z29.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmul z30.s, p7/m, z30.s, z15.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmul z31.s, p7/m, z31.s, z15.s\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add a_ptr3, a_ptr3, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "2:\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z29.s, z9.s, 
z3.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla 
z31.s, z11.s, z3.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, 
z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z28.s, z12.s, z7.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z29.s, z13.s, z7.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z30.s, z14.s, z7.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + 
"ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], 
#1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "ld1w z11.s, p3/z, 
[%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z28.s, z12.s, z7.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z29.s, z13.s, z7.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z30.s, z14.s, z7.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL 
VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, 
z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + 
"fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla 
z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + "st1w z28.s, p0, [c_ptr3]\n" + "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" + "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" + "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq 
a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp new file mode 100644 index 0000000000..f5634e3618 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + +#include + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_native_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int); + +class native_s8s32_dot_4VLx4 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int); + + /* Kernel blocking parameters */ + static int out_height() + { + return 4; + } + + static int out_width() + { + return get_vector_length() * 4; + } + + static int k_unroll() + { + return 4; + } + + + + // Default to the generic kernel + kern_type kernel=sve_native_s8s32_dot_4VLx4; + + native_s8s32_dot_4VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp new file mode 100644 index 0000000000..9c02d95044 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp @@ -0,0 +1,4632 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0); + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + K -= (regs_count + 1) * 16; + const long leftovers = K; + const long blocks_count = K / 4; + const long odds_count = K - (blocks_count * 4); + + for (int y=0; y())) { + const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); + const int32_t *betaptr = β + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = blocks_count; + long odds = odds_count; + const int8_t *a_ptr0 = a_ptr0_base; + const int8_t *b_ptr0 = B + x0; + const int8_t *b_ptr1 = b_ptr0 + ldb; + const int8_t *b_ptr2 = b_ptr1 + ldb; + const int8_t *b_ptr3 = b_ptr2 + ldb; + long ldbb = ldb * sizeof(int8_t) * 4; + + switch(M-y) { + case 1: + __asm __volatile ( + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z18.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z19.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1rqb z0.b, p7/z, 
[%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "2:\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z13.b, z13.b, z14.b\n" + "subs %[loops], %[loops], #0x1\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "ld1b z10.b, p4/z, 
[%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + 
"ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot 
z17.s, z9.b, z0.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, 
z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + 
"16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "b 9f\n" + "5:\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" 
+ "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + 
"mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "9:\n" + 
"st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z19.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z20.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z21.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z22.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z23.s, #0\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add 
%[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z10.b, z10.b, z8.b\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "subs %[loops], %[loops], #0x1\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, 
z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z8.b, z9.b, 
z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "zip2 
z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot 
z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" 
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], 
#0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + 
"zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "b 9f\n" + "5:\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add 
%[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z19.s, z15.b, 
z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, 
[%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot 
z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] 
"+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z20.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z21.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z22.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z23.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z24.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z25.s, #0\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + 
"mov z26.s, #0\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "mov z27.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z24.s, p7/m, z24.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z25.s, p7/m, z25.s, z15.s\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mul z26.s, p7/m, z26.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z27.s, p7/m, z27.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z10.b, z10.b, z8.b\n" + "2:\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z15.b, z12.b, 
z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip1 z14.b, z15.b, z8.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + 
"sdot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], 
#-0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" 
+ "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot 
z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, 
z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, 
z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z21.s, 
z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + 
"ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + 
"ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot 
z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], 
%[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "b 9f\n" + "5:\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" 
+ "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z19.s, 
z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, 
z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, 
[%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, 
z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], 
%[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z20.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z21.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z22.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z23.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z24.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z25.s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z26.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "mov z27.s, #0\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "mov z28.s, #0\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, 
z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z28.s, p0/z, [c_ptr3]\n" + "mul z24.s, p7/m, z24.s, z15.s\n" + "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "mul z25.s, p7/m, z25.s, z15.s\n" + "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" + "mul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" + "mul z27.s, p7/m, z27.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z28.s, p7/m, z28.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z29.s, p7/m, z29.s, z15.s\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mul z30.s, p7/m, z30.s, z15.s\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mul z31.s, p7/m, z31.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z15.b, 
z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z13.b, z13.b, z14.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip1 z14.b, z15.b, z8.b\n" + "add a_ptr2, a_ptr2, #0x20\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + 
"add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z28.s, z12.b, z3.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + 
"sdot z25.s, z13.b, z2.b[3]\n" + "sdot z29.s, z13.b, z3.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z30.s, z14.b, z3.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" + "sdot z31.s, z15.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z7.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z29.s, z9.b, z7.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z30.s, z10.b, z7.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "sdot z31.s, z11.b, z7.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" 
+ "sdot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z28.s, z12.b, z7.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z29.s, z13.b, z7.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z30.s, z14.b, z7.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "sdot z31.s, z15.b, z7.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z28.s, z8.b, z7.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z29.s, z9.b, z7.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z30.s, z10.b, z7.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, 
z6.b[2]\n" + "sdot z31.s, z11.b, z7.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z28.s, z12.b, z7.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z29.s, z13.b, z7.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z30.s, z14.b, z7.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "sdot z31.s, z15.b, z7.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, 
z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z28.s, z12.b, z3.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z29.s, z13.b, z3.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z30.s, z14.b, z3.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "sdot z31.s, z15.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z7.b[0]\n" + "zip2 z8.b, z14.b, 
z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z29.s, z9.b, z7.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z30.s, z10.b, z7.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "sdot z31.s, z11.b, z7.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z28.s, z12.b, z7.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z29.s, z13.b, z7.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z30.s, z14.b, z7.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "sdot z31.s, z15.b, z7.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z28.s, z8.b, z7.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z29.s, z9.b, z7.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z30.s, z10.b, z7.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "sdot z31.s, z11.b, z7.b[2]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z28.s, z12.b, z7.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z29.s, z13.b, z7.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z30.s, z14.b, z7.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "sdot z31.s, z15.b, z7.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z26.s, z10.b, 
z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "sdot z17.s, z9.b, 
z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z28.s, z12.b, z3.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z29.s, z13.b, z3.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z30.s, z14.b, z3.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "sdot z31.s, z15.b, z3.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add 
%[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, 
p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z25.s, z9.b, 
z2.b[0]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "b 9f\n" + "5:\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p6/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, 
z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z28.s, z12.b, z3.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z29.s, z13.b, z3.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z30.s, z14.b, z3.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "sdot z31.s, z15.b, z3.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], 
%[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z7.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z29.s, z9.b, z7.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z30.s, z10.b, z7.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "sdot z31.s, z11.b, z7.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z28.s, z12.b, z7.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z29.s, z13.b, z7.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z30.s, z14.b, z7.b[1]\n" + "sdot z19.s, 
z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "sdot z31.s, z15.b, z7.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z28.s, z8.b, z7.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z29.s, z9.b, z7.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z30.s, z10.b, z7.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "sdot z31.s, z11.b, z7.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z28.s, z12.b, z7.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z29.s, z13.b, z7.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z30.s, z14.b, z7.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "sdot z31.s, z15.b, z7.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z28.s, z8.b, z7.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z29.s, z9.b, z7.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z30.s, z10.b, 
z7.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "sdot z31.s, z11.b, z7.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z28.s, z12.b, z7.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z29.s, z13.b, z7.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z30.s, z14.b, z7.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "sdot z31.s, z15.b, z7.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, 
[%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z7.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z29.s, z9.b, z7.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z30.s, z10.b, z7.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "sdot z31.s, z11.b, z7.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + "st1w z28.s, p0, [c_ptr3]\n" + "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" + "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" + "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), 
[b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp new file mode 100644 index 0000000000..f5ebad8565 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + +#include + + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_native_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int); + +class native_u8u32_dot_4VLx4 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int); + + /* Kernel blocking parameters */ + static int out_height() + { + return 4; + } + + static int out_width() + { + return get_vector_length() * 4; + } + + static int k_unroll() + { + return 4; + } + + + + // Default to the generic kernel + kern_type kernel=sve_native_u8u32_dot_4VLx4; + + native_u8u32_dot_4VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp new file mode 100644 index 0000000000..7d89948dc1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp @@ -0,0 +1,4632 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0u); + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + K -= (regs_count + 1) * 16; + const long leftovers = K; + const long blocks_count = K / 4; + const long odds_count = K - (blocks_count * 4); + + for (int y=0; y())) { + const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); + const uint32_t *betaptr = β + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = blocks_count; + long odds = odds_count; + const uint8_t *a_ptr0 = a_ptr0_base; + const uint8_t *b_ptr0 = B + x0; + const uint8_t *b_ptr1 = b_ptr0 + ldb; + const uint8_t *b_ptr2 = b_ptr1 + ldb; + const uint8_t *b_ptr3 = b_ptr2 + ldb; + long ldbb = ldb * sizeof(uint8_t) * 4; + + switch(M-y) { + case 1: + __asm __volatile ( + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z18.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z19.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1rqb 
z0.b, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "2:\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z13.b, z13.b, z14.b\n" + "subs %[loops], %[loops], #0x1\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add 
%[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "ld1b z10.b, p4/z, 
[%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[3]\n" + 
"ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot 
z17.s, z9.b, z0.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, 
z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + 
"16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "b 9f\n" + "5:\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" 
+ "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + 
"mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "9:\n" + 
"st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z19.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z20.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z21.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z22.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z23.s, #0\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add 
%[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z10.b, z10.b, z8.b\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "subs %[loops], %[loops], #0x1\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z21.s, z9.b, z1.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, 
z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z8.b, z9.b, 
z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z22.s, z10.b, z1.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "zip2 
z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z23.s, z11.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot 
z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" 
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], 
#0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + 
"zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "b 9f\n" + "5:\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add 
%[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z23.s, z11.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z19.s, z15.b, 
z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, 
[%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot 
z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] 
"+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z20.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z21.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z22.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z23.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z24.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z25.s, #0\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + 
"mov z26.s, #0\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "mov z27.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z24.s, p7/m, z24.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z25.s, p7/m, z25.s, z15.s\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mul z26.s, p7/m, z26.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z27.s, p7/m, z27.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z10.b, z10.b, z8.b\n" + "2:\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z15.b, z12.b, 
z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip1 z14.b, z15.b, z8.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "udot z27.s, z11.b, z2.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + 
"udot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], 
#-0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" 
+ "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot 
z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, 
z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, 
z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z21.s, 
z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + 
"ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + 
"ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot 
z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], 
%[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "b 9f\n" + "5:\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" 
+ "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z19.s, 
z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, 
z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, 
[%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, 
z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], 
%[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z20.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z21.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z22.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z23.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z24.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z25.s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z26.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "mov z27.s, #0\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "mov z28.s, #0\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, 
z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z28.s, p0/z, [c_ptr3]\n" + "mul z24.s, p7/m, z24.s, z15.s\n" + "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "mul z25.s, p7/m, z25.s, z15.s\n" + "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" + "mul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" + "mul z27.s, p7/m, z27.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z28.s, p7/m, z28.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z29.s, p7/m, z29.s, z15.s\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mul z30.s, p7/m, z30.s, z15.s\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mul z31.s, p7/m, z31.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z15.b, 
z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z13.b, z13.b, z14.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip1 z14.b, z15.b, z8.b\n" + "add a_ptr2, a_ptr2, #0x20\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "udot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + 
"add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z28.s, z12.b, z3.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + 
"udot z25.s, z13.b, z2.b[3]\n" + "udot z29.s, z13.b, z3.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z30.s, z14.b, z3.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" + "udot z31.s, z15.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z7.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z29.s, z9.b, z7.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z30.s, z10.b, z7.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "udot z31.s, z11.b, z7.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" 
+ "udot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z28.s, z12.b, z7.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z29.s, z13.b, z7.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z30.s, z14.b, z7.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "udot z31.s, z15.b, z7.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z28.s, z8.b, z7.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z29.s, z9.b, z7.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z30.s, z10.b, z7.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, 
z6.b[2]\n" + "udot z31.s, z11.b, z7.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z28.s, z12.b, z7.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z29.s, z13.b, z7.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z30.s, z14.b, z7.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "udot z31.s, z15.b, z7.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, 
z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z28.s, z12.b, z3.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z29.s, z13.b, z3.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z30.s, z14.b, z3.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "udot z31.s, z15.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z7.b[0]\n" + "zip2 z8.b, z14.b, 
z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z29.s, z9.b, z7.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z30.s, z10.b, z7.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "udot z31.s, z11.b, z7.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z28.s, z12.b, z7.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z29.s, z13.b, z7.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z30.s, z14.b, z7.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "udot z31.s, z15.b, z7.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z28.s, z8.b, z7.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z29.s, z9.b, z7.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z30.s, z10.b, z7.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "udot z31.s, z11.b, z7.b[2]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z28.s, z12.b, z7.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z29.s, z13.b, z7.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z30.s, z14.b, z7.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "udot z31.s, z15.b, z7.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z26.s, z10.b, 
z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "udot z17.s, z9.b, 
z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z28.s, z12.b, z3.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z29.s, z13.b, z3.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z30.s, z14.b, z3.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "udot z31.s, z15.b, z3.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add 
%[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, 
p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z25.s, z9.b, 
z2.b[0]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "b 9f\n" + "5:\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p6/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, 
z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z28.s, z12.b, z3.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z29.s, z13.b, z3.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z30.s, z14.b, z3.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "udot z31.s, z15.b, z3.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], 
%[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z7.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z29.s, z9.b, z7.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z30.s, z10.b, z7.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "udot z31.s, z11.b, z7.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z28.s, z12.b, z7.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z29.s, z13.b, z7.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z30.s, z14.b, z7.b[1]\n" + "udot z19.s, 
z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "udot z31.s, z15.b, z7.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z28.s, z8.b, z7.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z29.s, z9.b, z7.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z30.s, z10.b, z7.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "udot z31.s, z11.b, z7.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z28.s, z12.b, z7.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z29.s, z13.b, z7.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z30.s, z14.b, z7.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "udot z31.s, z15.b, z7.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z28.s, z8.b, z7.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z29.s, z9.b, z7.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z30.s, z10.b, 
z7.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "udot z31.s, z11.b, z7.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z28.s, z12.b, z7.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z29.s, z13.b, z7.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z30.s, z14.b, z7.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "udot z31.s, z15.b, z7.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, 
[%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z7.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z29.s, z9.b, z7.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z30.s, z10.b, z7.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "udot z31.s, z11.b, z7.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + "st1w z28.s, p0, [c_ptr3]\n" + "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" + "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" + "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), 
[b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp new file mode 100644 index 0000000000..80b216ca14 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_smallK_fp32_mla_1VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int); + +class smallK_fp32_mla_1VLx4 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int); + + /* Kernel blocking parameters */ + static int out_height() + { + return 4; + } + + static int out_width() + { + return get_vector_length() * 1; + } + + static int k_unroll() + { + return 1; + } + + + + // Default to the generic kernel + kern_type kernel=sve_smallK_fp32_mla_1VLx4; + + smallK_fp32_mla_1VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp new file mode 100644 index 0000000000..e2cc1d14e2 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp @@ -0,0 +1,4264 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_smallK_fp32_mla_1VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) { + const long beta0 = (beta == 0.0f); + + const long loops_count = M / 4; + const long oddrow_count = M % 4; + const long ldab = lda * sizeof(float); + const long ldcb = ldc * sizeof(float); + const long odd_depth = K % 4; + const float *betaptr = β + long ldbb = ldb * sizeof(float); + + for (int x0=0; x0() * 1)) { + const long width = std::min((unsigned long)N-x0, (get_vector_length() * 1)); + long loops = loops_count; + long oddrows = oddrow_count; + long temp = 0; + const float *b_ptr0 = B + x0; + + const float *a_ptr0 = A; + + float *c_ptr0 = C + x0; + + switch(K) { + case 1: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], 
LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, 
a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : 
[betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, 
z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, 
[%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + 
"subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 5: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, 
[c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "st1w 
z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 6: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs 
%[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, 
z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 7: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, 
[c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + 
"subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 8: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, 
[%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "fmla z31.s, z11.s, 
z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 9: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + 
"ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + 
"fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + 
".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 10: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + 
"ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, 
z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 11: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + 
"c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" 
+ "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, 
[%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 12: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w 
z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, 
z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, 
z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 13: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, 
z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, 
[%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 14: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + 
"add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + 
"fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, 
%[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 15: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + 
"ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla 
z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add 
a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" 
(ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 16: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + 
"mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, 
z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla 
z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 17: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, 
[%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + 
"fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, 
z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" 
(a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 18: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w 
z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, 
p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + 
"st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", 
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 19: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 
1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla 
z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], 
LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 20: + __asm __volatile ( + "a_ptr1 
.req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov 
z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, 
z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add 
c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + 
); + break; + case 21: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z24.s, p0/z, 
[%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla 
z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla 
z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), 
[c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 22: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add 
%[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z24.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z25.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, 
z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + 
"fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], 
%[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 23: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + 
"add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z24.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z25.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z26.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla 
z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla 
z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "fmla z28.s, z26.s, z0.s[2]\n" + "fmla z29.s, z26.s, z1.s[2]\n" + "fmla z30.s, z26.s, z2.s[2]\n" + "fmla z31.s, z26.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + 
"st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "fmla z28.s, z26.s, z1.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", 
"x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + default: + case 24: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z24.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z25.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z26.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z27.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, 
z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + 
"fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "fmla z28.s, z26.s, z0.s[2]\n" + "fmla z29.s, z26.s, z1.s[2]\n" + "fmla z30.s, z26.s, z2.s[2]\n" + "fmla z31.s, z26.s, z3.s[2]\n" + "fmla z28.s, z27.s, z0.s[3]\n" + "fmla z29.s, z27.s, z1.s[3]\n" + "fmla z30.s, z27.s, z2.s[3]\n" + "fmla z31.s, z27.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, 
z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "fmla z28.s, z26.s, z1.s[2]\n" + "fmla z28.s, z27.s, z1.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp new file mode 100644 index 0000000000..aa2c522382 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_smallK_hybrid_fp32_mla_1VLx4(const float *, int, const float *, float *, int, float, int, int, int); + +class smallK_hybrid_fp32_mla_1VLx4 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int); + + /* Kernel blocking parameters */ + static int out_height() + { + return 4; + } + + static int out_width() + { + return get_vector_length<float>() * 1; + } + + static int k_unroll() + { + return 1; + } + + StdTransformsSVE<operand_type, result_type, 4, 1, 1> transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx4; + + smallK_hybrid_fp32_mla_1VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp new file mode 100644 index 0000000000..3e7e713106 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp @@ -0,0 +1,4004 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include <algorithm> + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_smallK_hybrid_fp32_mla_1VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) { + const long beta0 = (beta == 0.0f); + + const long loops_count = M / 4; + const long oddrow_count = M % 4; + const long ldab = lda * sizeof(float); + const long ldcb = ldc * sizeof(float); + const int K_stride = K; + const long odd_depth = K % 4; + const float *betaptr = &beta; + + for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) { + const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1)); + long loops = loops_count; + long oddrows = oddrow_count; + long temp = 0; + const float *b_ptr0 = B + (K_stride * x0); + + const float *a_ptr0 = A; + + float *c_ptr0 = C + x0; + + switch(K) { + case 1: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, 
p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: 
+ __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], 
#0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, 
z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + 
"a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, 
%[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 5: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" 
+ "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla 
z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 6: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + 
"ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla 
z28.s, z9.s, z1.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 7: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, 
p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, 
z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 8: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + 
"ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + 
"7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 9: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, 
[%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, 
z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", 
"z30", "z31", "cc", "memory" + ); + break; + case 10: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, 
[a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, 
z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 11: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], 
#-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, 
a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", 
"x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 12: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + 
"fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, 
c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 13: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, 
%[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, 
z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz 
%[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 14: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, 
[%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + 
"fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, 
%[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 15: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add 
a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, 
p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "st1w z28.s, p0, 
[%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 16: + __asm __volatile ( + "a_ptr1 
.req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, 
z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, 
a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" 
(loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 17: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w 
z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, 
z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, 
z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 18: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, 
p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla 
z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, 
p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), 
[b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 19: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + 
"mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + 
"fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, 
c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 20: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + 
"c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, 
z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla 
z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla 
z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 21: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, 
p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, 
#0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, 
[a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], 
#0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 22: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, 
[%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, 
z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, 
z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], 
#0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 23: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + 
"add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, 
z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, 
z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "fmla z28.s, z26.s, z0.s[2]\n" + "fmla z29.s, z26.s, z1.s[2]\n" + "fmla z30.s, z26.s, z2.s[2]\n" + "fmla z31.s, z26.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], 
LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "fmla z28.s, z26.s, z1.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", 
"z30", "z31", "cc", "memory" + ); + break; + default: + case 24: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z27.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, 
p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + 
"fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, 
z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "fmla z28.s, z26.s, z0.s[2]\n" + "fmla z29.s, z26.s, z1.s[2]\n" + "fmla z30.s, z26.s, z2.s[2]\n" + "fmla z31.s, z26.s, z3.s[2]\n" + "fmla z28.s, z27.s, z0.s[3]\n" + "fmla z29.s, z27.s, z1.s[3]\n" + "fmla z30.s, z27.s, z2.s[3]\n" + "fmla z31.s, z27.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "fmla z28.s, z26.s, z1.s[2]\n" + "fmla z28.s, z27.s, z1.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq 
a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp new file mode 100644 index 0000000000..fcdca59bdd --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp @@ -0,0 +1,1660 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +template<> +inline void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) +{ + const float *inptr = in; + + for (int y=y0; y= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]); + outptr0++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q4, [%[inptr]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q5, [%[inptr], #0x10]\n" + "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr0], #0x10]\n" + "ldr q6, [%[inptr], #0x20]\n" + "add %[inptr], %[inptr], #0x180\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x20]\n" + "add %[outptr0], %[outptr0], #0x30\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 2: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]); + outptr1++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q4, [%[inptr]]\n" + 
"prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q5, [%[inptr], #0x30]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q6, [%[inptr], #0x10]\n" + "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x10]\n" + "ldr q7, [%[inptr], #0x40]\n" + "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr1], #0x10]\n" + "ldr q4, [%[inptr], #0x20]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x20]\n" + "ldr q5, [%[inptr], #0x50]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x20]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 3: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]); + outptr2++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q4, [%[inptr]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q5, [%[inptr], #0x30]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q6, [%[inptr], #0x60]\n" + "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" + "fmul 
v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q7, [%[inptr], #0x10]\n" + "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr0], #0x10]\n" + "ldr q4, [%[inptr], #0x40]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr1], #0x10]\n" + "ldr q5, [%[inptr], #0x70]\n" + "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr2], #0x10]\n" + "ldr q6, [%[inptr], #0x20]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x20]\n" + "ldr q7, [%[inptr], #0x50]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr1], #0x20]\n" + "ldr q4, [%[inptr], #0x80]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr2], #0x20]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 4: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]); + outptr3++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q4, [%[inptr]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q5, [%[inptr], #0x30]\n" + "prfm PLDL1KEEP, [%[inptr], 
#0x1c0]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q6, [%[inptr], #0x60]\n" + "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q7, [%[inptr], #0x90]\n" + "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q4, [%[inptr], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x10]\n" + "ldr q5, [%[inptr], #0x40]\n" + "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x10]\n" + "ldr q6, [%[inptr], #0x70]\n" + "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x10]\n" + "ldr q7, [%[inptr], #0xa0]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x10]\n" + "ldr q4, [%[inptr], #0x20]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x20]\n" + "ldr q5, [%[inptr], #0x50]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x20]\n" + "ldr q6, [%[inptr], #0x80]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x20]\n" + "ldr q7, [%[inptr], #0xb0]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x20]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 5: + { + if ((i+11) >= xmax) + { + for (int 
xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]); + outptr4++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q4, [%[inptr]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q5, [%[inptr], #0x30]\n" + "prfm PLDL1KEEP, [%[inptr], #0x240]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q6, [%[inptr], #0x60]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q7, [%[inptr], #0x90]\n" + "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q4, [%[inptr], #0xc0]\n" + "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4]]\n" + "ldr q5, [%[inptr], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr0], #0x10]\n" + "ldr q6, [%[inptr], #0x40]\n" + "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr1], #0x10]\n" + "ldr q7, [%[inptr], #0x70]\n" + "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr2], #0x10]\n" + "ldr q4, [%[inptr], #0xa0]\n" + "prfm PSTL1KEEP, [%[outptr4], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr3], #0x10]\n" + "ldr q5, [%[inptr], #0xd0]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr4], #0x10]\n" + "ldr q6, [%[inptr], #0x20]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x20]\n" + "ldr q7, [%[inptr], #0x50]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v11.4s, v7.4s, 
%[alpha].s[0]\n" + "str q11, [%[outptr1], #0x20]\n" + "ldr q4, [%[inptr], #0x80]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr2], #0x20]\n" + "ldr q5, [%[inptr], #0xb0]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr3], #0x20]\n" + "ldr q6, [%[inptr], #0xe0]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr4], #0x20]\n" + "add %[outptr4], %[outptr4], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 6: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]); + outptr5++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q4, [%[inptr]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q5, [%[inptr], #0x30]\n" + "prfm PLDL1KEEP, [%[inptr], #0x240]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q6, [%[inptr], #0x60]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q7, [%[inptr], #0x90]\n" + "prfm PLDL1KEEP, [%[inptr], #0x280]\n" + "fmul v11.4s, 
v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q4, [%[inptr], #0xc0]\n" + "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4]]\n" + "ldr q5, [%[inptr], #0xf0]\n" + "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5]]\n" + "ldr q6, [%[inptr], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x10]\n" + "ldr q7, [%[inptr], #0x40]\n" + "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr1], #0x10]\n" + "ldr q4, [%[inptr], #0x70]\n" + "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr2], #0x10]\n" + "ldr q5, [%[inptr], #0xa0]\n" + "prfm PSTL1KEEP, [%[outptr4], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr3], #0x10]\n" + "ldr q6, [%[inptr], #0xd0]\n" + "prfm PSTL1KEEP, [%[outptr5], #0x60]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr4], #0x10]\n" + "ldr q7, [%[inptr], #0x100]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr5], #0x10]\n" + "ldr q4, [%[inptr], #0x20]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x20]\n" + "ldr q5, [%[inptr], #0x50]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x20]\n" + "ldr q6, [%[inptr], #0x80]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x20]\n" + "ldr q7, [%[inptr], #0xb0]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x20]\n" + "ldr q4, [%[inptr], #0xe0]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4], #0x20]\n" + "ldr q5, [%[inptr], #0x110]\n" + "add %[outptr4], %[outptr4], #0x30\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, 
[%[outptr5], #0x20]\n" + "add %[outptr5], %[outptr5], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 7: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]); + outptr6++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q4, [%[inptr]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q5, [%[inptr], #0x30]\n" + "prfm PLDL1KEEP, [%[inptr], #0x240]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q6, [%[inptr], #0x60]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q7, [%[inptr], #0x90]\n" + "prfm PLDL1KEEP, [%[inptr], #0x280]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q4, [%[inptr], #0xc0]\n" + "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4]]\n" + "ldr q5, [%[inptr], #0xf0]\n" + "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5]]\n" + "ldr q6, [%[inptr], #0x120]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul 
v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr6]]\n" + "ldr q7, [%[inptr], #0x10]\n" + "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr0], #0x10]\n" + "ldr q4, [%[inptr], #0x40]\n" + "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr1], #0x10]\n" + "ldr q5, [%[inptr], #0x70]\n" + "prfm PSTL1KEEP, [%[outptr4], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr2], #0x10]\n" + "ldr q6, [%[inptr], #0xa0]\n" + "prfm PSTL1KEEP, [%[outptr5], #0x60]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr3], #0x10]\n" + "ldr q7, [%[inptr], #0xd0]\n" + "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr4], #0x10]\n" + "ldr q4, [%[inptr], #0x100]\n" + "prfm PSTL1KEEP, [%[outptr6], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr5], #0x10]\n" + "ldr q5, [%[inptr], #0x130]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr6], #0x10]\n" + "ldr q6, [%[inptr], #0x20]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x20]\n" + "ldr q7, [%[inptr], #0x50]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr1], #0x20]\n" + "ldr q4, [%[inptr], #0x80]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr2], #0x20]\n" + "ldr q5, [%[inptr], #0xb0]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr3], #0x20]\n" + "ldr q6, [%[inptr], #0xe0]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr4], #0x20]\n" + "ldr q7, [%[inptr], #0x110]\n" + "add %[outptr4], %[outptr4], #0x30\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr5], #0x20]\n" + "ldr q4, [%[inptr], #0x140]\n" + "add %[outptr5], %[outptr5], #0x30\n" + "fmul v8.4s, v4.4s, 
%[alpha].s[0]\n" + "str q8, [%[outptr6], #0x20]\n" + "add %[outptr6], %[outptr6], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + default: + case 8: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]); + outptr6++; + *outptr7 = (alpha * inptr[xi + 84]); + outptr7++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q4, [%[inptr]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q5, [%[inptr], #0x30]\n" + "prfm PLDL1KEEP, [%[inptr], #0x240]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q6, [%[inptr], #0x60]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q7, [%[inptr], #0x90]\n" + "prfm PLDL1KEEP, [%[inptr], #0x280]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q4, [%[inptr], #0xc0]\n" + "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4]]\n" + "ldr q5, [%[inptr], #0xf0]\n" + "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, 
[%[outptr5]]\n" + "ldr q6, [%[inptr], #0x120]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr6]]\n" + "ldr q7, [%[inptr], #0x150]\n" + "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr7]]\n" + "ldr q4, [%[inptr], #0x10]\n" + "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x10]\n" + "ldr q5, [%[inptr], #0x40]\n" + "prfm PSTL1KEEP, [%[outptr4], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x10]\n" + "ldr q6, [%[inptr], #0x70]\n" + "prfm PSTL1KEEP, [%[outptr5], #0x60]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x10]\n" + "ldr q7, [%[inptr], #0xa0]\n" + "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x10]\n" + "ldr q4, [%[inptr], #0xd0]\n" + "prfm PSTL1KEEP, [%[outptr6], #0x60]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4], #0x10]\n" + "ldr q5, [%[inptr], #0x100]\n" + "prfm PSTL1KEEP, [%[outptr7], #0x60]\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5], #0x10]\n" + "ldr q6, [%[inptr], #0x130]\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr6], #0x10]\n" + "ldr q7, [%[inptr], #0x160]\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr7], #0x10]\n" + "ldr q4, [%[inptr], #0x20]\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x20]\n" + "ldr q5, [%[inptr], #0x50]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x20]\n" + "ldr q6, [%[inptr], #0x80]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x20]\n" + "ldr q7, [%[inptr], #0xb0]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x20]\n" + "ldr q4, [%[inptr], #0xe0]\n" + "add %[outptr3], 
%[outptr3], #0x30\n" + "fmul v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4], #0x20]\n" + "ldr q5, [%[inptr], #0x110]\n" + "add %[outptr4], %[outptr4], #0x30\n" + "fmul v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5], #0x20]\n" + "ldr q6, [%[inptr], #0x140]\n" + "add %[outptr5], %[outptr5], #0x30\n" + "fmul v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr6], #0x20]\n" + "ldr q7, [%[inptr], #0x170]\n" + "add %[outptr6], %[outptr6], #0x30\n" + "fmul v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr7], #0x20]\n" + "add %[outptr7], %[outptr7], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + + } + } + else + { + switch(height) { + case 1: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q8, [%[outptr0]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr]]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q9, [%[outptr0], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x10]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr0], #0x10]\n" + "ldr q10, [%[outptr0], #0x20]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x20]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x20]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "add %[inptr], 
%[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 2: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q8, [%[outptr0]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr]]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q9, [%[outptr1]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x30]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q10, [%[outptr0], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x10]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x10]\n" + "ldr q11, [%[outptr1], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x40]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr1], #0x10]\n" + "ldr q8, [%[outptr0], #0x20]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x20]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x20]\n" + "ldr q9, [%[outptr1], #0x20]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x50]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, 
[%[outptr1], #0x20]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 3: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q8, [%[outptr0]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr]]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q9, [%[outptr1]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x30]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q10, [%[outptr2]]\n" + "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x60]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q11, [%[outptr0], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x10]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr0], #0x10]\n" + "ldr q8, [%[outptr1], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x40]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr1], #0x10]\n" + "ldr q9, 
[%[outptr2], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x70]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr2], #0x10]\n" + "ldr q10, [%[outptr0], #0x20]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x20]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x20]\n" + "ldr q11, [%[outptr1], #0x20]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x50]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr1], #0x20]\n" + "ldr q8, [%[outptr2], #0x20]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x80]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr2], #0x20]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 4: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q8, [%[outptr0]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr]]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, 
[%[outptr0]]\n" + "ldr q9, [%[outptr1]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x30]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q10, [%[outptr2]]\n" + "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x60]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q11, [%[outptr3]]\n" + "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x90]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q8, [%[outptr0], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x10]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x10]\n" + "ldr q9, [%[outptr1], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x40]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x10]\n" + "ldr q10, [%[outptr2], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x70]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x10]\n" + "ldr q11, [%[outptr3], #0x10]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0xa0]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x10]\n" + "ldr q8, [%[outptr0], #0x20]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x20]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x20]\n" + "ldr q9, [%[outptr1], #0x20]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x50]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x20]\n" + "ldr q10, [%[outptr2], #0x20]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr 
q6, [%[inptr], #0x80]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x20]\n" + "ldr q11, [%[outptr3], #0x20]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0xb0]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x20]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 5: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q8, [%[outptr0]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr]]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q9, [%[outptr1]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x240]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x30]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q10, [%[outptr2]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x60]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q11, 
[%[outptr3]]\n" + "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x90]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q8, [%[outptr4]]\n" + "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0xc0]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4]]\n" + "ldr q9, [%[outptr0], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x10]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr0], #0x10]\n" + "ldr q10, [%[outptr1], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x40]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr1], #0x10]\n" + "ldr q11, [%[outptr2], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x70]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr2], #0x10]\n" + "ldr q8, [%[outptr3], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr4], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0xa0]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr3], #0x10]\n" + "ldr q9, [%[outptr4], #0x10]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0xd0]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr4], #0x10]\n" + "ldr q10, [%[outptr0], #0x20]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x20]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x20]\n" + "ldr q11, [%[outptr1], #0x20]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x50]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr1], #0x20]\n" + "ldr q8, [%[outptr2], #0x20]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, 
[%[inptr], #0x80]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr2], #0x20]\n" + "ldr q9, [%[outptr3], #0x20]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0xb0]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr3], #0x20]\n" + "ldr q10, [%[outptr4], #0x20]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0xe0]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr4], #0x20]\n" + "add %[outptr4], %[outptr4], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 6: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); + outptr5++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q8, [%[outptr0]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr]]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q9, [%[outptr1]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x240]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x30]\n" + 
"fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q10, [%[outptr2]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x60]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q11, [%[outptr3]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x280]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x90]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q8, [%[outptr4]]\n" + "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0xc0]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4]]\n" + "ldr q9, [%[outptr5]]\n" + "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0xf0]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5]]\n" + "ldr q10, [%[outptr0], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x10]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x10]\n" + "ldr q11, [%[outptr1], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x40]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr1], #0x10]\n" + "ldr q8, [%[outptr2], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x70]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr2], #0x10]\n" + "ldr q9, [%[outptr3], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr4], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0xa0]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr3], #0x10]\n" + "ldr q10, [%[outptr4], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr5], #0x60]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0xd0]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr4], 
#0x10]\n" + "ldr q11, [%[outptr5], #0x10]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x100]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr5], #0x10]\n" + "ldr q8, [%[outptr0], #0x20]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x20]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x20]\n" + "ldr q9, [%[outptr1], #0x20]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x50]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x20]\n" + "ldr q10, [%[outptr2], #0x20]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x80]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x20]\n" + "ldr q11, [%[outptr3], #0x20]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0xb0]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x20]\n" + "ldr q8, [%[outptr4], #0x20]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0xe0]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4], #0x20]\n" + "ldr q9, [%[outptr5], #0x20]\n" + "add %[outptr4], %[outptr4], #0x30\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x110]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5], #0x20]\n" + "add %[outptr5], %[outptr5], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + case 7: + { + if ((i+11) >= xmax) + { 
+ for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); + outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); + outptr6++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q8, [%[outptr0]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr]]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q9, [%[outptr1]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x240]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x30]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q10, [%[outptr2]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x60]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q11, [%[outptr3]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x280]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x90]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q8, [%[outptr4]]\n" + "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0xc0]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4]]\n" + "ldr q9, [%[outptr5]]\n" + "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0xf0]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5]]\n" + "ldr q10, [%[outptr6]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v10.4s, v10.4s, 
%[beta].s[0]\n" + "ldr q6, [%[inptr], #0x120]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr6]]\n" + "ldr q11, [%[outptr0], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x10]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr0], #0x10]\n" + "ldr q8, [%[outptr1], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x40]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr1], #0x10]\n" + "ldr q9, [%[outptr2], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr4], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x70]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr2], #0x10]\n" + "ldr q10, [%[outptr3], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr5], #0x60]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0xa0]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr3], #0x10]\n" + "ldr q11, [%[outptr4], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0xd0]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr4], #0x10]\n" + "ldr q8, [%[outptr5], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr6], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x100]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr5], #0x10]\n" + "ldr q9, [%[outptr6], #0x10]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x130]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr6], #0x10]\n" + "ldr q10, [%[outptr0], #0x20]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x20]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr0], #0x20]\n" + "ldr q11, [%[outptr1], #0x20]\n" + "add %[outptr0], %[outptr0], #0x30\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x50]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str 
q11, [%[outptr1], #0x20]\n" + "ldr q8, [%[outptr2], #0x20]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x80]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr2], #0x20]\n" + "ldr q9, [%[outptr3], #0x20]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0xb0]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr3], #0x20]\n" + "ldr q10, [%[outptr4], #0x20]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0xe0]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr4], #0x20]\n" + "ldr q11, [%[outptr5], #0x20]\n" + "add %[outptr4], %[outptr4], #0x30\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x110]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr5], #0x20]\n" + "ldr q8, [%[outptr6], #0x20]\n" + "add %[outptr5], %[outptr5], #0x30\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x140]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr6], #0x20]\n" + "add %[outptr6], %[outptr6], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + default: + case 8: + { + if ((i+11) >= xmax) + { + for (int xi=0; xi<12; xi++) + { + if ((i+xi) < xmax) + { + *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); + outptr0++; + *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta); + outptr1++; + *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta); + outptr2++; + *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta); 
+ outptr3++; + *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta); + outptr4++; + *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta); + outptr5++; + *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta); + outptr6++; + *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta); + outptr7++; + } + } + inptr += 96; + } else { + /* Optimized routine to copy an entire block */ + __asm __volatile ( + "ldr q8, [%[outptr0]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x180]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr]]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0]]\n" + "ldr q9, [%[outptr1]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x240]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x30]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1]]\n" + "ldr q10, [%[outptr2]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x60]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2]]\n" + "ldr q11, [%[outptr3]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x280]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x90]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3]]\n" + "ldr q8, [%[outptr4]]\n" + "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0xc0]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4]]\n" + "ldr q9, [%[outptr5]]\n" + "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0xf0]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5]]\n" + "ldr q10, [%[outptr6]]\n" + "prfm PLDL1KEEP, [%[inptr], #0x200]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x120]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr6]]\n" + "ldr q11, [%[outptr7]]\n" + "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x150]\n" + "fmla 
v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr7]]\n" + "ldr q8, [%[outptr0], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x10]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x10]\n" + "ldr q9, [%[outptr1], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr4], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x40]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x10]\n" + "ldr q10, [%[outptr2], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr5], #0x60]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x70]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x10]\n" + "ldr q11, [%[outptr3], #0x10]\n" + "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0xa0]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x10]\n" + "ldr q8, [%[outptr4], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr6], #0x60]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0xd0]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4], #0x10]\n" + "ldr q9, [%[outptr5], #0x10]\n" + "prfm PLDL1KEEP, [%[outptr7], #0x60]\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x100]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5], #0x10]\n" + "ldr q10, [%[outptr6], #0x10]\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x130]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr6], #0x10]\n" + "ldr q11, [%[outptr7], #0x10]\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x160]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr7], #0x10]\n" + "ldr q8, [%[outptr0], #0x20]\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0x20]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr0], #0x20]\n" + "ldr q9, [%[outptr1], #0x20]\n" + "add %[outptr0], %[outptr0], 
#0x30\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x50]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr1], #0x20]\n" + "ldr q10, [%[outptr2], #0x20]\n" + "add %[outptr1], %[outptr1], #0x30\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x80]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr2], #0x20]\n" + "ldr q11, [%[outptr3], #0x20]\n" + "add %[outptr2], %[outptr2], #0x30\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0xb0]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr3], #0x20]\n" + "ldr q8, [%[outptr4], #0x20]\n" + "add %[outptr3], %[outptr3], #0x30\n" + "fmul v8.4s, v8.4s, %[beta].s[0]\n" + "ldr q4, [%[inptr], #0xe0]\n" + "fmla v8.4s, v4.4s, %[alpha].s[0]\n" + "str q8, [%[outptr4], #0x20]\n" + "ldr q9, [%[outptr5], #0x20]\n" + "add %[outptr4], %[outptr4], #0x30\n" + "fmul v9.4s, v9.4s, %[beta].s[0]\n" + "ldr q5, [%[inptr], #0x110]\n" + "fmla v9.4s, v5.4s, %[alpha].s[0]\n" + "str q9, [%[outptr5], #0x20]\n" + "ldr q10, [%[outptr6], #0x20]\n" + "add %[outptr5], %[outptr5], #0x30\n" + "fmul v10.4s, v10.4s, %[beta].s[0]\n" + "ldr q6, [%[inptr], #0x140]\n" + "fmla v10.4s, v6.4s, %[alpha].s[0]\n" + "str q10, [%[outptr6], #0x20]\n" + "ldr q11, [%[outptr7], #0x20]\n" + "add %[outptr6], %[outptr6], #0x30\n" + "fmul v11.4s, v11.4s, %[beta].s[0]\n" + "ldr q7, [%[inptr], #0x170]\n" + "fmla v11.4s, v7.4s, %[alpha].s[0]\n" + "str q11, [%[outptr7], #0x20]\n" + "add %[outptr7], %[outptr7], #0x30\n" + "add %[inptr], %[inptr], #0x180\n" + : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), + [inptr] "+r" (inptr) + : [alpha] "w" (alpha), [beta] "w" (beta) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" + ); + } + } + break; + + + } + } + } + } +} + 
+#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp index e422b91c83..0330783a0b 100644 --- a/src/core/NEON/kernels/arm_gemm/transform.hpp +++ b/src/core/NEON/kernels/arm_gemm/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -81,11 +81,14 @@ struct TransformImpl { } } // "row" tail - row is out of range so fill with zeros always. - for (int row = 0; row < blank_rows; row++) { - for (int col=0; col < (fill_cols + blank_cols); col++) { - *out++ = static_cast(0); - } + TOut zeroval = static_cast(0); + int pads = blank_rows * (fill_cols + blank_cols); + + for (int i=0; i @@ -173,4 +173,4 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * } } -#endif // __aarch64__ +#endif // __aarch64__ && !__ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index fc1f2c24f4..e1ebba077b 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,17 +23,14 @@ */ #include "a32_interleave_6way_32bit.hpp" #include "a32_transpose_interleave_8way_32bit.hpp" -#ifdef __ARM_FEATURE_SVE -#include "sve_interleave_8way_32bit.hpp" -#include "sve_interleave_8way_block2_32bit.hpp" -#include "sve_interleave_8way_block4_8bit.hpp" -#else -#include "a64_interleave_8way_32bit.hpp" -#endif #include "a64_block16_interleave4_8bit.hpp" #include "a64_interleave_8way_16bit.hpp" +#include "a64_interleave_8way_32bit.hpp" #include "a64_interleave_8way_half_to_float.hpp" #include "a64_transpose_interleave_12way_16bit.hpp" #include "a64_transpose_interleave_12way_half_to_float.hpp" #include "a64_transpose_interleave_24way_16bit.hpp" -#include "transpose_interleave_common.hpp" +#include "sve_interleave_8way_32bit.hpp" +#include "sve_interleave_8way_block2_32bit.hpp" +#include "sve_interleave_8way_block4_8bit.hpp" +#include "transpose_interleave_common.hpp" \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp index 752e837f8d..07c8219c1b 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -41,7 +41,7 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * long outpos = 0; uint32_t *outptr = master_outptr; - master_outptr += outwidth; + master_outptr += (outwidth * 1); const uint32_t *inptr0 = inptr + y * ldin + k0; const uint32_t *inptr1 = inptr0 + ldin; @@ -60,52 +60,53 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z4.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" - "incw %[inpos], all, mul #1\n" - "whilelt p0.s, %[outpos], %[outwidth]\n" - "incw %[outpos], all, mul #1\n" + "ld1w z0.s, p0/z, [%[inptr0]]\n" "zip1 z8.s, z0.s, z4.s\n" + "incw %[inpos], all, mul #1\n" "zip2 z9.s, z0.s, z4.s\n" - "whilelt p1.s, %[outpos], %[outwidth]\n" - "incw %[outpos], all, mul #1\n" + "addvl %[inptr0], %[inptr0], #1\n" "zip1 z0.s, z8.s, z4.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z1.s, z8.s, z4.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.s, z9.s, z4.s\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z3.s, z9.s, z4.s\n" - "whilelt p2.s, %[outpos], %[outwidth]\n" - "zip1 z8.s, z0.s, z4.s\n" "incw %[outpos], all, mul #1\n" + "zip1 z8.s, z0.s, z4.s\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z9.s, z0.s, z4.s\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.s, z1.s, z4.s\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z11.s, z1.s, z4.s\n" - "st1w z8.s, p0, [%[outptr]]\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip1 z12.s, z2.s, z4.s\n" - "whilelt p3.s, %[outpos], %[outwidth]\n" - "zip2 z13.s, z2.s, z4.s\n" "incw %[outpos], all, mul #1\n" + "zip2 z13.s, z2.s, z4.s\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip1 z14.s, z3.s, z4.s\n" - "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" "zip2 z15.s, z3.s, z4.s\n" - "whilelt p4.s, %[outpos], %[outwidth]\n" - "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "incw %[outpos], all, 
mul #1\n" - "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" - "whilelt p5.s, %[outpos], %[outwidth]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" + "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" - "whilelt p6.s, %[outpos], %[outwidth]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" - "whilelt p7.s, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n" + "incw %[outpos], all, mul #1\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" - "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; @@ -115,60 +116,62 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z4.s, #0\n" - "mov z14.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" - "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" - "incw %[inpos], all, mul #1\n" - "whilelt p0.s, %[outpos], %[outwidth]\n" - "incw %[outpos], all, mul #1\n" + "ld1w z0.s, p0/z, [%[inptr0]]\n" "zip1 z8.s, z0.s, z4.s\n" + "ld1w z1.s, p0/z, [%[inptr1]]\n" "zip2 z9.s, z0.s, z4.s\n" + "incw %[inpos], all, mul #1\n" "zip1 z10.s, z1.s, z4.s\n" + "addvl %[inptr0], %[inptr0], #1\n" "zip2 z11.s, 
z1.s, z4.s\n" - "whilelt p1.s, %[outpos], %[outwidth]\n" + "addvl %[inptr1], %[inptr1], #1\n" "zip1 z0.s, z8.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z1.s, z8.s, z4.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.s, z9.s, z4.s\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z3.s, z9.s, z4.s\n" - "zip1 z4.s, z10.s, z14.s\n" + "incw %[outpos], all, mul #1\n" + "mov z14.s, #0\n" "whilelt p2.s, %[outpos], %[outwidth]\n" - "zip2 z5.s, z10.s, z14.s\n" + "zip1 z4.s, z10.s, z14.s\n" "incw %[outpos], all, mul #1\n" + "zip2 z5.s, z10.s, z14.s\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip1 z6.s, z11.s, z14.s\n" + "incw %[outpos], all, mul #1\n" "zip2 z7.s, z11.s, z14.s\n" "zip1 z8.s, z0.s, z4.s\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z9.s, z0.s, z4.s\n" - "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.s, z1.s, z5.s\n" - "incw %[outpos], all, mul #1\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip2 z11.s, z1.s, z5.s\n" - "st1w z8.s, p0, [%[outptr]]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.s, z2.s, z6.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z13.s, z2.s, z6.s\n" + "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n" "zip1 z14.s, z3.s, z7.s\n" - "whilelt p4.s, %[outpos], %[outwidth]\n" - "zip2 z15.s, z3.s, z7.s\n" - "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" - "whilelt p5.s, %[outpos], %[outwidth]\n" + "zip2 z15.s, z3.s, z7.s\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" - "whilelt p6.s, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" - "whilelt p7.s, %[outpos], %[outwidth]\n" + "whilelt p3.s, 
%[outpos], %[outwidth]\n" + "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" - "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" - "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; @@ -178,63 +181,66 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z4.s, #0\n" - "mov z14.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" - "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" - "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" - "incw %[inpos], all, mul #1\n" - "whilelt p0.s, %[outpos], %[outwidth]\n" + "ld1w z0.s, p0/z, [%[inptr0]]\n" "zip1 z8.s, z0.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "ld1w z1.s, p0/z, [%[inptr1]]\n" "zip2 z9.s, z0.s, z4.s\n" + "ld1w z2.s, p0/z, [%[inptr2]]\n" "zip1 z10.s, z1.s, z4.s\n" + "incw %[inpos], all, mul #1\n" "zip2 z11.s, z1.s, z4.s\n" + "addvl %[inptr0], %[inptr0], #1\n" "zip1 z12.s, z2.s, z4.s\n" - "whilelt p1.s, %[outpos], %[outwidth]\n" + "addvl %[inptr1], %[inptr1], #1\n" "zip2 z13.s, z2.s, z4.s\n" - "incw %[outpos], all, mul #1\n" - "zip1 z4.s, z10.s, z14.s\n" + "addvl %[inptr2], %[inptr2], #1\n" "zip1 z0.s, z8.s, z12.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z1.s, z8.s, z12.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.s, z9.s, z13.s\n" - "whilelt p2.s, %[outpos], %[outwidth]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z3.s, 
z9.s, z13.s\n" "incw %[outpos], all, mul #1\n" + "mov z14.s, #0\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "zip1 z4.s, z10.s, z14.s\n" + "incw %[outpos], all, mul #1\n" "zip2 z5.s, z10.s, z14.s\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip1 z6.s, z11.s, z14.s\n" + "incw %[outpos], all, mul #1\n" "zip2 z7.s, z11.s, z14.s\n" "zip1 z8.s, z0.s, z4.s\n" - "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z9.s, z0.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.s, z1.s, z5.s\n" - "st1w z8.s, p0, [%[outptr]]\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip2 z11.s, z1.s, z5.s\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.s, z2.s, z6.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z13.s, z2.s, z6.s\n" - "whilelt p4.s, %[outpos], %[outwidth]\n" + "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n" "zip1 z14.s, z3.s, z7.s\n" - "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" - "zip2 z15.s, z3.s, z7.s\n" "incw %[outpos], all, mul #1\n" - "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" - "whilelt p5.s, %[outpos], %[outwidth]\n" + "zip2 z15.s, z3.s, z7.s\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" - "whilelt p6.s, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" - "whilelt p7.s, %[outpos], %[outwidth]\n" - "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" - "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" 
(inptr1), [inptr2] "+r" (inptr2) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; @@ -244,65 +250,69 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z4.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" - "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" - "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" - "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" - "incw %[inpos], all, mul #1\n" + "ld1w z0.s, p0/z, [%[inptr0]]\n" "zip1 z8.s, z0.s, z4.s\n" - "whilelt p0.s, %[outpos], %[outwidth]\n" + "ld1w z1.s, p0/z, [%[inptr1]]\n" "zip2 z9.s, z0.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "ld1w z2.s, p0/z, [%[inptr2]]\n" "zip1 z10.s, z1.s, z4.s\n" + "ld1w z3.s, p0/z, [%[inptr3]]\n" "zip2 z11.s, z1.s, z4.s\n" + "incw %[inpos], all, mul #1\n" "zip1 z12.s, z2.s, z4.s\n" + "addvl %[inptr0], %[inptr0], #1\n" "zip2 z13.s, z2.s, z4.s\n" - "whilelt p1.s, %[outpos], %[outwidth]\n" + "addvl %[inptr1], %[inptr1], #1\n" "zip1 z14.s, z3.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "addvl %[inptr2], %[inptr2], #1\n" "zip2 z15.s, z3.s, z4.s\n" + "addvl %[inptr3], %[inptr3], #1\n" "zip1 z0.s, z8.s, z12.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z1.s, z8.s, z12.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.s, z9.s, z13.s\n" - "whilelt p2.s, %[outpos], %[outwidth]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z3.s, z9.s, z13.s\n" "incw %[outpos], all, mul #1\n" "zip1 z4.s, z10.s, z14.s\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z5.s, z10.s, z14.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.s, z11.s, z15.s\n" - "zip2 z7.s, z11.s, z15.s\n" "whilelt 
p3.s, %[outpos], %[outwidth]\n" - "zip1 z8.s, z0.s, z4.s\n" + "zip2 z7.s, z11.s, z15.s\n" "incw %[outpos], all, mul #1\n" + "zip1 z8.s, z0.s, z4.s\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z9.s, z0.s, z4.s\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.s, z1.s, z5.s\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip2 z11.s, z1.s, z5.s\n" - "st1w z8.s, p0, [%[outptr]]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.s, z2.s, z6.s\n" - "whilelt p4.s, %[outpos], %[outwidth]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z13.s, z2.s, z6.s\n" - "incw %[outpos], all, mul #1\n" + "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n" "zip1 z14.s, z3.s, z7.s\n" - "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.s, z3.s, z7.s\n" - "whilelt p5.s, %[outpos], %[outwidth]\n" - "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" - "whilelt p6.s, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" - "whilelt p7.s, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" - "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" - "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : 
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; @@ -312,66 +322,71 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z5.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" - "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" - "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" - "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" - "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n" + "ld1w z0.s, p0/z, [%[inptr0]]\n" + "ld1w z1.s, p0/z, [%[inptr1]]\n" "incw %[inpos], all, mul #1\n" "zip1 z10.s, z1.s, z5.s\n" - "whilelt p0.s, %[outpos], %[outwidth]\n" + "ld1w z2.s, p0/z, [%[inptr2]]\n" "zip2 z11.s, z1.s, z5.s\n" - "incw %[outpos], all, mul #1\n" + "ld1w z3.s, p0/z, [%[inptr3]]\n" + "zip1 z12.s, z2.s, z5.s\n" + "ld1w z4.s, p0/z, [%[inptr4]]\n" "zip1 z8.s, z0.s, z4.s\n" + "addvl %[inptr0], %[inptr0], #1\n" "zip2 z9.s, z0.s, z4.s\n" - "zip1 z12.s, z2.s, z5.s\n" + "addvl %[inptr1], %[inptr1], #1\n" "zip2 z13.s, z2.s, z5.s\n" - "whilelt p1.s, %[outpos], %[outwidth]\n" + "addvl %[inptr2], %[inptr2], #1\n" "zip1 z14.s, z3.s, z5.s\n" - "incw %[outpos], all, mul #1\n" + "addvl %[inptr3], %[inptr3], #1\n" "zip2 z15.s, z3.s, z5.s\n" + "addvl %[inptr4], %[inptr4], #1\n" "zip1 z0.s, z8.s, z12.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z1.s, z8.s, z12.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.s, z9.s, z13.s\n" - "whilelt p2.s, %[outpos], %[outwidth]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z3.s, z9.s, z13.s\n" "incw %[outpos], all, mul #1\n" "zip1 z4.s, z10.s, z14.s\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z5.s, z10.s, z14.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.s, z11.s, z15.s\n" - "zip2 z7.s, z11.s, z15.s\n" "whilelt p3.s, %[outpos], %[outwidth]\n" - "zip1 z8.s, z0.s, z4.s\n" + "zip2 z7.s, z11.s, z15.s\n" "incw %[outpos], all, mul #1\n" + "zip1 z8.s, z0.s, 
z4.s\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z9.s, z0.s, z4.s\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.s, z1.s, z5.s\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip2 z11.s, z1.s, z5.s\n" - "st1w z8.s, p0, [%[outptr]]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.s, z2.s, z6.s\n" - "whilelt p4.s, %[outpos], %[outwidth]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z13.s, z2.s, z6.s\n" - "incw %[outpos], all, mul #1\n" + "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n" "zip1 z14.s, z3.s, z7.s\n" - "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.s, z3.s, z7.s\n" - "whilelt p5.s, %[outpos], %[outwidth]\n" - "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" - "whilelt p6.s, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" - "whilelt p7.s, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" - "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" - "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); 
break; @@ -381,67 +396,73 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z6.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" - "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" - "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" - "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" - "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n" - "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n" + "ld1w z0.s, p0/z, [%[inptr0]]\n" + "ld1w z1.s, p0/z, [%[inptr1]]\n" "incw %[inpos], all, mul #1\n" + "ld1w z2.s, p0/z, [%[inptr2]]\n" + "addvl %[inptr0], %[inptr0], #1\n" "zip1 z12.s, z2.s, z6.s\n" - "whilelt p0.s, %[outpos], %[outwidth]\n" + "ld1w z3.s, p0/z, [%[inptr3]]\n" + "zip2 z13.s, z2.s, z6.s\n" + "ld1w z4.s, p0/z, [%[inptr4]]\n" "zip1 z8.s, z0.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "ld1w z5.s, p0/z, [%[inptr5]]\n" "zip2 z9.s, z0.s, z4.s\n" + "addvl %[inptr1], %[inptr1], #1\n" "zip1 z10.s, z1.s, z5.s\n" + "addvl %[inptr2], %[inptr2], #1\n" "zip2 z11.s, z1.s, z5.s\n" - "zip2 z13.s, z2.s, z6.s\n" - "whilelt p1.s, %[outpos], %[outwidth]\n" + "addvl %[inptr3], %[inptr3], #1\n" "zip1 z14.s, z3.s, z6.s\n" - "incw %[outpos], all, mul #1\n" + "addvl %[inptr4], %[inptr4], #1\n" "zip2 z15.s, z3.s, z6.s\n" + "addvl %[inptr5], %[inptr5], #1\n" "zip1 z0.s, z8.s, z12.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z1.s, z8.s, z12.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.s, z9.s, z13.s\n" - "whilelt p2.s, %[outpos], %[outwidth]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z3.s, z9.s, z13.s\n" "incw %[outpos], all, mul #1\n" "zip1 z4.s, z10.s, z14.s\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z5.s, z10.s, z14.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.s, z11.s, z15.s\n" - "zip2 z7.s, z11.s, z15.s\n" "whilelt p3.s, %[outpos], %[outwidth]\n" - "zip1 z8.s, z0.s, z4.s\n" + "zip2 z7.s, z11.s, z15.s\n" "incw %[outpos], all, mul #1\n" + "zip1 z8.s, 
z0.s, z4.s\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z9.s, z0.s, z4.s\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.s, z1.s, z5.s\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip2 z11.s, z1.s, z5.s\n" - "st1w z8.s, p0, [%[outptr]]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.s, z2.s, z6.s\n" - "whilelt p4.s, %[outpos], %[outwidth]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z13.s, z2.s, z6.s\n" - "incw %[outpos], all, mul #1\n" + "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n" "zip1 z14.s, z3.s, z7.s\n" - "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.s, z3.s, z7.s\n" - "whilelt p5.s, %[outpos], %[outwidth]\n" - "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" - "whilelt p6.s, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" - "whilelt p7.s, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" - "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" - "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", 
"z14", "z15", "cc", "memory" ); break; @@ -451,68 +472,75 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" "mov z7.s, #0\n" - "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" - "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" - "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" - "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" - "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n" - "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n" - "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n" + "ld1w z0.s, p0/z, [%[inptr0]]\n" + "ld1w z1.s, p0/z, [%[inptr1]]\n" "incw %[inpos], all, mul #1\n" + "ld1w z2.s, p0/z, [%[inptr2]]\n" + "addvl %[inptr0], %[inptr0], #1\n" + "ld1w z3.s, p0/z, [%[inptr3]]\n" + "addvl %[inptr1], %[inptr1], #1\n" "zip1 z14.s, z3.s, z7.s\n" - "whilelt p0.s, %[outpos], %[outwidth]\n" + "ld1w z4.s, p0/z, [%[inptr4]]\n" "zip1 z8.s, z0.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "ld1w z5.s, p0/z, [%[inptr5]]\n" "zip2 z9.s, z0.s, z4.s\n" + "ld1w z6.s, p0/z, [%[inptr6]]\n" "zip1 z10.s, z1.s, z5.s\n" + "addvl %[inptr2], %[inptr2], #1\n" "zip2 z11.s, z1.s, z5.s\n" + "addvl %[inptr3], %[inptr3], #1\n" "zip1 z12.s, z2.s, z6.s\n" - "whilelt p1.s, %[outpos], %[outwidth]\n" + "addvl %[inptr4], %[inptr4], #1\n" "zip2 z13.s, z2.s, z6.s\n" - "incw %[outpos], all, mul #1\n" + "addvl %[inptr5], %[inptr5], #1\n" "zip2 z15.s, z3.s, z7.s\n" + "addvl %[inptr6], %[inptr6], #1\n" "zip1 z0.s, z8.s, z12.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z1.s, z8.s, z12.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z2.s, z9.s, z13.s\n" - "whilelt p2.s, %[outpos], %[outwidth]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip2 z3.s, z9.s, z13.s\n" "incw %[outpos], all, mul #1\n" "zip1 z4.s, z10.s, z14.s\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip2 z5.s, z10.s, z14.s\n" + "incw %[outpos], all, mul #1\n" "zip1 z6.s, z11.s, z15.s\n" - "zip2 z7.s, z11.s, z15.s\n" "whilelt p3.s, 
%[outpos], %[outwidth]\n" - "zip1 z8.s, z0.s, z4.s\n" + "zip2 z7.s, z11.s, z15.s\n" "incw %[outpos], all, mul #1\n" + "zip1 z8.s, z0.s, z4.s\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z9.s, z0.s, z4.s\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.s, z1.s, z5.s\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip2 z11.s, z1.s, z5.s\n" - "st1w z8.s, p0, [%[outptr]]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.s, z2.s, z6.s\n" - "whilelt p4.s, %[outpos], %[outwidth]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z13.s, z2.s, z6.s\n" - "incw %[outpos], all, mul #1\n" + "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n" "zip1 z14.s, z3.s, z7.s\n" - "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.s, z3.s, z7.s\n" - "whilelt p5.s, %[outpos], %[outwidth]\n" - "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" - "whilelt p6.s, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" - "whilelt p7.s, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" - "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" - "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", 
"z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; @@ -522,69 +550,77 @@ inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T * "1:\n" "whilelt p0.s, %[inpos], %[inwidth]\n" "b.none 2f\n" - "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n" - "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n" - "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n" - "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n" - "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n" - "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n" - "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n" - "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n" + "ld1w z0.s, p0/z, [%[inptr0]]\n" "incw %[inpos], all, mul #1\n" + "ld1w z1.s, p0/z, [%[inptr1]]\n" + "addvl %[inptr0], %[inptr0], #1\n" + "ld1w z2.s, p0/z, [%[inptr2]]\n" + "addvl %[inptr1], %[inptr1], #1\n" + "ld1w z3.s, p0/z, [%[inptr3]]\n" + "addvl %[inptr2], %[inptr2], #1\n" + "ld1w z4.s, p0/z, [%[inptr4]]\n" + "addvl %[inptr3], %[inptr3], #1\n" "zip1 z8.s, z0.s, z4.s\n" - "whilelt p0.s, %[outpos], %[outwidth]\n" + "ld1w z5.s, p0/z, [%[inptr5]]\n" "zip2 z9.s, z0.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "ld1w z6.s, p0/z, [%[inptr6]]\n" "zip1 z10.s, z1.s, z5.s\n" + "ld1w z7.s, p0/z, [%[inptr7]]\n" "zip2 z11.s, z1.s, z5.s\n" + "addvl %[inptr4], %[inptr4], #1\n" "zip1 z12.s, z2.s, z6.s\n" + "addvl %[inptr5], %[inptr5], #1\n" "zip2 z13.s, z2.s, z6.s\n" - "whilelt p1.s, %[outpos], %[outwidth]\n" + "addvl %[inptr6], %[inptr6], #1\n" "zip1 z14.s, z3.s, z7.s\n" - "incw %[outpos], all, mul #1\n" + "addvl %[inptr7], %[inptr7], #1\n" "zip2 z15.s, z3.s, z7.s\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip1 z0.s, z8.s, z12.s\n" + "incw %[outpos], all, mul #1\n" "zip2 z1.s, z8.s, z12.s\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" "zip1 z2.s, z9.s, z13.s\n" - "whilelt p2.s, %[outpos], %[outwidth]\n" 
- "zip2 z3.s, z9.s, z13.s\n" "incw %[outpos], all, mul #1\n" + "zip2 z3.s, z9.s, z13.s\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" "zip1 z4.s, z10.s, z14.s\n" + "incw %[outpos], all, mul #1\n" "zip2 z5.s, z10.s, z14.s\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" "zip1 z6.s, z11.s, z15.s\n" + "incw %[outpos], all, mul #1\n" "zip2 z7.s, z11.s, z15.s\n" - "whilelt p3.s, %[outpos], %[outwidth]\n" "zip1 z8.s, z0.s, z4.s\n" - "incw %[outpos], all, mul #1\n" + "st1w z8.s, p0, [%[outptr]]\n" "zip2 z9.s, z0.s, z4.s\n" + "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" "zip1 z10.s, z1.s, z5.s\n" + "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" "zip2 z11.s, z1.s, z5.s\n" - "st1w z8.s, p0, [%[outptr]]\n" + "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" "zip1 z12.s, z2.s, z6.s\n" - "whilelt p4.s, %[outpos], %[outwidth]\n" + "whilelt p0.s, %[outpos], %[outwidth]\n" "zip2 z13.s, z2.s, z6.s\n" - "incw %[outpos], all, mul #1\n" + "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n" "zip1 z14.s, z3.s, z7.s\n" - "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n" + "incw %[outpos], all, mul #1\n" "zip2 z15.s, z3.s, z7.s\n" - "whilelt p5.s, %[outpos], %[outwidth]\n" - "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n" + "whilelt p1.s, %[outpos], %[outwidth]\n" + "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n" - "whilelt p6.s, %[outpos], %[outwidth]\n" + "whilelt p2.s, %[outpos], %[outwidth]\n" + "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n" - "whilelt p7.s, %[outpos], %[outwidth]\n" + "whilelt p3.s, %[outpos], %[outwidth]\n" + "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n" "incw %[outpos], all, mul #1\n" - "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n" - "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n" - "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n" "addvl %[outptr], %[outptr], #8\n" "b 1b\n" "2:\n" : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" 
(inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7) : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth) - : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory" ); break; diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp index a1fc00ea89..8b96c328a6 100644 --- a/src/core/NEON/kernels/arm_gemm/utils.hpp +++ b/src/core/NEON/kernels/arm_gemm/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,7 @@ #pragma once -#ifdef __ARM_FEATURE_SVE -#include -#endif +#include // Macro for unreachable code (e.g. 
impossible default cases on switch) #define UNREACHABLE(why) __builtin_unreachable() @@ -49,13 +47,43 @@ inline T roundup(const T a, const T b) { } } +namespace arm_gemm { +namespace utils { +namespace { + +#ifdef __ARM_FEATURE_SVE +template +inline unsigned long get_vector_length_sz() { + unsigned long v; + + __asm ( + "cntb %0" + : "=r" (v) + ); + + return v / sz; +} + +#define VEC_LEN_SPEC(sz, opcode) template <> inline unsigned long get_vector_length_sz() { unsigned long v; __asm ( opcode " %0" : "=r" (v)); return v; } + +VEC_LEN_SPEC(8, "cntd") +VEC_LEN_SPEC(4, "cntw") +VEC_LEN_SPEC(2, "cnth") +VEC_LEN_SPEC(1, "cntb") +#endif + +} // anonymous namespace + template inline unsigned long get_vector_length() { #ifdef __ARM_FEATURE_SVE - const unsigned long length = svcntb(); + return get_vector_length_sz(); #else - const unsigned long length = 16; + return 16 / sizeof(T); #endif +} + +} // utils namespace +} // arm_gemm namespace - return length / sizeof(T); -} \ No newline at end of file +using namespace arm_gemm::utils; \ No newline at end of file diff --git a/src/core/NEON/kernels/assembly/Helpers.cpp b/src/core/NEON/kernels/assembly/Helpers.cpp index 09ac08c0a4..3d8d66d7fc 100644 --- a/src/core/NEON/kernels/assembly/Helpers.cpp +++ b/src/core/NEON/kernels/assembly/Helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,91 +24,47 @@ #include "arm_compute/core/NEON/kernels/assembly/Helpers.h" -#include "NEGEMMInterleavedStrategies.h" +#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" namespace arm_compute { -namespace -{ -template -BlockSizes calculate_block_sizes_template(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K) -{ - using strategy = typename Kernel::strategy; - return calculate_block_sizes(ci, M, N, K); -} -} // namespace - -const char *get_strategy_name(DataType input_type, bool use_dot) +arm_gemm::KernelDescription get_gemm_info(DataType input_type, + const CPUInfo &ci, + const unsigned int num_threads, + const INEGEMMWrapperKernel::Params &p, + float alpha, + float beta, + bool pretranspose_hint) { switch(input_type) { - case DataType::F32: - return Kernel::name; #ifdef __aarch64__ - case DataType::U8: case DataType::QASYMM8: - if(use_dot) - { - return Kernel::name; - } - else - { - return Kernel::name; - } - case DataType::S8: - if(use_dot) - { - return Kernel::name; - } - else - { - return Kernel::name; - } -#endif /* __aarch64__ */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Kernel<__fp16>::name; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("DataType not supported"); - break; - } -} - -BlockSizes calculate_block_sizes_from_data_type(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K, DataType input_type, bool use_dot) -{ - switch(input_type) - { - case DataType::F32: - return calculate_block_sizes_template(ci, M, N, K); -#ifdef __aarch64__ case DataType::U8: - case DataType::QASYMM8: - if(use_dot) - { - return calculate_block_sizes_template(ci, M, N, K); - } - else - { - return calculate_block_sizes_template(ci, M, N, K); - } + { + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint); + return arm_gemm::get_gemm_method(args); + } case 
DataType::S8: - if(use_dot) - { - return calculate_block_sizes_template(ci, M, N, K); - } - else - { - return calculate_block_sizes_template(ci, M, N, K); - } -#endif /* __aarch64__ */ + { + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint); + return arm_gemm::get_gemm_method(args); + } +#endif // __aarch64__ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return calculate_block_sizes_template<__fp16>(ci, M, N, K); + { + arm_gemm::GemmArgs<__fp16> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint); + return arm_gemm::get_gemm_method<__fp16, __fp16>(args); + } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint); + return arm_gemm::get_gemm_method(args); + } default: - ARM_COMPUTE_ERROR("DataType not supported"); - break; + return arm_gemm::KernelDescription(); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp deleted file mode 100644 index 3b2975dd80..0000000000 --- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h" - -#include "NEGEMMInterleavedStrategies.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/WindowIterator.h" - -namespace arm_compute -{ -template -void NEGEMMInterleavedMatrixMultiplyWrapperTemplate::configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker, - const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params ¶ms, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads) -{ - using strategy = typename Kernel::strategy; - - _prepared_a = prepared_a; - _transformed_b = transformed_b; - _tmp_c = tmp_c; - _c = c; - _block_walker = block_walker; - _block_sizes = block_sizes; - _params = params; - _b_is_pretransposed = b_is_pretransposed; - _alpha = alpha; - _beta = beta; - - auto_init_if_empty(*_tmp_c->info(), c->info()->clone()->set_tensor_shape(TensorShape{ _block_sizes.x_block * strategy::out_height(), max_num_threads })); -} - -template -void NEGEMMInterleavedMatrixMultiplyWrapperTemplate::transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, - const Coordinates &end_offset) -{ - using strategy = typename Kernel::strategy; - - strategy strat(info.cpu_info); - TensorAccessor prepared_a(*_prepared_a); - TensorAccessor transformed_b(*_transformed_b); - TensorAccessor c(*_c); - TensorAccessor tmp_c(*_tmp_c); - - int prev_batch = -1; - To *a_ptr = nullptr; - auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id) - { - const unsigned int y = id.x(); - const unsigned int batch = id.y(); - const unsigned int ymax = std::min(_params.M, y + strategy::out_height()); - - // If it's the first block of a 
new batch then reset the pointer to A. - if(prev_batch != static_cast(batch)) - { - const unsigned int first_m = id.x(); - a_ptr = prepared_a(0, first_m, batch); - prev_batch = batch; - } - - // Call matrix multiply assembly routine to process the block: - strat.kernel(a_ptr, transformed_b(wl._offset_transformed_b), tmp_c(0, info.thread_id), 1, wl._bblocks, wl._kern_k); - a_ptr += strategy::out_height() * wl._kern_k; - - // Merge the result with the other blocks' results: - strat.transforms.Merge(c(0, 0, batch, wl._multi), tmp_c(0, info.thread_id), c.stride(1), y, ymax, wl._x0, wl._xmax, _alpha, (wl._k0 == 0 ? _beta : static_cast(1))); - }); - auto on_new_row_size = [&](unsigned int start, unsigned int end) - { - //Nothing to do - }; - window_iterator.iterate_2D(on_new_row_size); -} - -template -void NEGEMMInterleavedMatrixMultiplyWrapperTemplate::create_workloads(std::vector &workloads) -{ - using strategy = typename Kernel::strategy; - - unsigned int offset_transformed_b = 0; - unsigned int wl_index = 0; - unsigned int num_buffers = 0, reshaped_block_size = 0; - - if(!_b_is_pretransposed) - { - num_buffers = _transformed_b->info()->tensor_shape()[1]; - reshaped_block_size = _transformed_b->info()->tensor_shape()[0]; - } - execute_window_loop(_block_walker, [&](const Coordinates & id) - { - const unsigned int x0 = id.x(); - const unsigned int k0 = id.y(); - const unsigned int multi = id.z(); - - const unsigned int xmax = std::min(x0 + _block_walker.x().step(), _params.N); - const unsigned int kmax = std::min(k0 + _block_walker.y().step(), _params.K); - - // Figure out how many "K" the kernel will actually process. 
- const int kern_k = ceil_to_multiple(kmax - k0, strategy::k_unroll()); - const int bblocks = DIV_CEIL(xmax - x0, strategy::out_width()); - - workloads.push_back(MatrixMultiplyWorkload(offset_transformed_b, x0, xmax, k0, kmax, multi, kern_k, bblocks)); - - if(_b_is_pretransposed) - { - offset_transformed_b += bblocks * strategy::out_width() * kern_k; - } - else - { - // Rotate through the BufferManager's buffers: - wl_index++; - offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size; - } - }); -} - -//TODO: regroup somewhere ? -template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate; -#ifdef __aarch64__ -template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate; -template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate; -template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate; -template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate; -#endif /* __aarch64__ */ - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -} // namespace arm_compute diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp deleted file mode 100644 index 7fc57f3c02..0000000000 --- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h" - -#include "NEGEMMInterleavedStrategies.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" - -namespace arm_compute -{ -namespace -{ -// Call the lambda function for each workload generated by the passed window. 
-template -void for_each_element_in_window(const Window &window, const ITensor *b, ITensor *transformed_b, unsigned int N, unsigned int K, Lambda &&lambda) -{ - using strategy = typename Kernel::strategy; - unsigned int wl_index = 0; - unsigned int num_buffers = 0, reshaped_block_size = 0; - - if(use_buffer_manager) - { - num_buffers = transformed_b->info()->tensor_shape()[1]; - reshaped_block_size = transformed_b->info()->strides_in_bytes().y(); - } - - unsigned int offset_transformed_b = transformed_b->info()->offset_first_element_in_bytes(); - execute_window_loop(window, [&](const Coordinates & coordinates) - { - const unsigned int x0 = coordinates.x(); - const unsigned int k0 = coordinates.y(); - const unsigned int multi = coordinates.z(); - - const unsigned int offset_b = b->info()->offset_element_in_bytes(Coordinates(0, 0, multi)); - const unsigned int xmax = std::min(x0 + window.x().step(), N); - const unsigned int kmax = std::min(k0 + window.y().step(), K); - - /* Figure out the size of each block. */ - unsigned int x_size = (xmax - x0); - unsigned int k_size = (kmax - k0); - - /* Round sizes up as needed. */ - x_size = ceil_to_multiple(x_size, strategy::out_width()); - k_size = ceil_to_multiple(k_size, strategy::k_unroll()); - - lambda(PrepareBWorkload(offset_b, offset_transformed_b, x0, xmax, k0, kmax)); - - //Each workload represents one block: - if(use_buffer_manager) - { - // Rotate through the BufferManager's buffers: - wl_index++; - offset_transformed_b = (wl_index % num_buffers) * reshaped_block_size; - } - else - { - offset_transformed_b += (x_size * k_size * sizeof(To)); - } - }); -} - -// Calculate the size of transformed_b: -template -unsigned int get_B_pretransposed_array_size(unsigned int N, unsigned int K, const BlockSizes &bs, unsigned int multis) -{ - using strategy = typename Kernel::strategy; - - // How many full blocks do N / K contain ? 
- size_t num_full_k = K / bs.k_block; - size_t num_full_x = N / bs.x_block; - - ARM_COMPUTE_ERROR_ON(bs.x_block % strategy::out_width() != 0); - ARM_COMPUTE_ERROR_ON(bs.k_block % strategy::k_unroll() != 0); - - size_t normal_x_size = bs.x_block; - size_t normal_k_size = bs.k_block; - - // Round up the leftovers to be a multiple of the strategy processing size: - size_t left_over_x_size = ceil_to_multiple(N % bs.x_block, strategy::out_width()); - size_t left_over_k_size = ceil_to_multiple(K % bs.k_block, strategy::k_unroll()); - - // Calculate the total size of the buffer: - size_t total = num_full_k * normal_k_size * (num_full_x * normal_x_size + left_over_x_size); - total += left_over_k_size * (left_over_x_size + num_full_x * normal_x_size); - - total *= multis; - - return total; -} - -} // namespace - -template -BlockSizes NEGEMMInterleavedPrepareBWrapperKernelTemplate::block_sizes() const -{ - return _block_sizes; -} - -template -void NEGEMMInterleavedPrepareBWrapperKernelTemplate::configure(const ITensor *b, ITensor *transformed_b, bool transpose_b, const CPUInfo &ci, const INEGEMMWrapperKernel::Params ¶ms) -{ - using strategy = typename Kernel::strategy; - - const unsigned int multis = b->info()->tensor_shape().z(); - _Nsize = b->info()->tensor_shape().x(); - _Ksize = b->info()->tensor_shape().y(); - _b = b; - _transformed_b = transformed_b; - _transpose_b = transpose_b; - - _block_sizes = calculate_block_sizes(ci, params.M, params.N, params.K); - - auto_init_if_empty(*transformed_b->info(), b->info()->clone()->set_tensor_shape(TensorShape{ get_B_pretransposed_array_size(_Nsize, _Ksize, _block_sizes, multis) })); - - Window window; - window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_Nsize, _block_sizes.x_block), _block_sizes.x_block)); - window.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_Ksize, _block_sizes.k_block), _block_sizes.k_block)); - window.set(Window::DimZ, Window::Dimension(0, multis)); - - INEKernel::configure(window); -} 
- -template -void NEGEMMInterleavedPrepareBWrapperKernelTemplate::transform(const PrepareBWorkload &wl, const ThreadInfo &info) -{ - using strategy = typename Kernel::strategy; - - strategy strat(info.cpu_info); - strat.transforms.PrepareB(reinterpret_cast(_transformed_b->buffer() + wl._offset_transformed_b), - reinterpret_cast(_b->buffer() + wl._offset_b), - _b->info()->strides_in_bytes().y() / sizeof(To), - wl._x0, wl._xmax, wl._k0, wl._kmax, _transpose_b); -} - -template -void NEGEMMInterleavedPrepareBWrapperKernelTemplate::create_workloads(std::vector &workloads) -{ - for_each_element_in_window(window(), _b, _transformed_b, _Nsize, _Ksize, [&workloads](PrepareBWorkload && wl) - { - workloads.push_back(std::move(wl)); - }); -} - -template -void NEGEMMInterleavedPrepareBWrapperKernelTemplate::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(window, INEKernel::window()); - for_each_element_in_window(window, _b, _transformed_b, _Nsize, _Ksize, [&](PrepareBWorkload && wl) - { - this->transform(wl, info); - }); -} - -template class NEGEMMInterleavedPrepareBWrapperKernelTemplate; -#ifdef __aarch64__ -template class NEGEMMInterleavedPrepareBWrapperKernelTemplate; -template class NEGEMMInterleavedPrepareBWrapperKernelTemplate; -template class NEGEMMInterleavedPrepareBWrapperKernelTemplate; -template class NEGEMMInterleavedPrepareBWrapperKernelTemplate; -#endif /* __aarch64__ */ - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class NEGEMMInterleavedPrepareBWrapperKernelTemplate; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -} // namespace arm_compute diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h index 69842fec80..da6ef2dea9 100644 --- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h +++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. 
+ * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -44,90 +44,175 @@ namespace arm_compute { -namespace +namespace detail { -template -struct Kernel +/** GEMM Interleaved Strategy interface */ +class IInterleavedStrategy { +public: + /** Virtual Destructor */ + virtual ~IInterleavedStrategy() = default; + /** Instantiate and configure a prepareB Kernel + * + * @param[in] b Input tensor B. + * @param[in] transformed_b Reshaped tensor B. + * @param[in] params GM, N, K sizes. + * @param[in] ci CPUInfo to be used for kernel configuration. + * + * @return A wrapped specialized prepareB kernel + */ + virtual std::unique_ptr instantiate_prepareB(const ITensor *b, + ITensor *transformed_b, + const INEGEMMWrapperKernel::Params ¶ms, + const CPUInfo &ci) = 0; + /** Instantiate and configure a transformA Kernel + * + * @param[in] a Input tensor A. + * @param[in] transformed_a Reshaped tensor A. + * @param[in] block_walker Window representing the layout of the matrix's blocks. + * @param[in] params M, N, K sizes. + * + * @return A wrapped specialized transformA kernel + */ + virtual std::unique_ptr instantiate_transformA(const ITensor *a, + ITensor *transformed_a, + const Window &block_walker, + const INEGEMMWrapperKernel::Params ¶ms) = 0; + /** Instantiate and configure a prepareB Kernel + * + * @param transformed_a Already reshaped tensor A. + * @param transformed_b Already reshaped tensor B. + * @param tmp_c Temporary buffer to be used to store intermediate results. + * @param c Result tensor C. + * @param block_walker Window containing iteration information for the M and batch dimensions. + * @param block_sizes Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes). + * @param params M, N, K sizes. + * @param alpha Alpha value + * @param beta Beta value + * @param pretranspose_b Is B also pretransposed ? + * @param num_threads Maximum number of threads that might be used for the calculations. 
+ * + * @return A wrapped specialized MatrixMultiply kernel + */ + virtual std::unique_ptr instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, + const Window &block_walker, const BlockSizes &block_sizes, + const INEGEMMWrapperKernel::Params ¶ms, float alpha, float beta, bool pretranspose_b, + unsigned int num_threads) = 0; + /** Calculates the block sizes of a given strategy + * + * @param[in] ci CPUInfo to be used for kernel configuration. + * @param[in] params M, N, K sizes. + * + * @return BlockSizes for a given strategy + */ + virtual BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params ¶ms) = 0; }; -#define DEFINE_STRATEGY_SUFFIX(strat, suffix) \ - using strategy = arm_gemm::strat; \ - static constexpr const char *name = #strat suffix; - -#define DEFINE_STRATEGY(strat) \ - DEFINE_STRATEGY_SUFFIX(strat, "") - -#ifdef __ARM_FEATURE_SVE -template <> -struct Kernel -{ - DEFINE_STRATEGY(interleaved_fp32_mla_3VLx8) -}; -template <> -struct Kernel -{ - DEFINE_STRATEGY(interleaved_fp16_mla_3VLx8) -}; -template -struct Kernel -{ - DEFINE_STRATEGY(interleaved_s8s32_dot_3VLx8) -}; -template -struct Kernel +/** Interleaved Strategy class */ +template +class InterleavedStrategy : public IInterleavedStrategy { - DEFINE_STRATEGY(interleaved_u8u32_dot_3VLx8) -}; -#else /* __ARM_FEATURE_SVE */ +public: + using strategy = StrategyType; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -struct Kernel -{ - DEFINE_STRATEGY(hgemm_24x8) -}; -#endif /*__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -#ifdef __aarch64__ -template <> -struct Kernel -{ - DEFINE_STRATEGY(sgemm_12x8) -}; -template <> -struct Kernel -{ - DEFINE_STRATEGY(gemm_s8_4x4) -}; -template <> -struct Kernel -{ - DEFINE_STRATEGY(gemm_u8_4x4) -}; +public: + // Inherited methods overridden + std::unique_ptr instantiate_prepareB(const ITensor *b, + ITensor *transformed_b, + const INEGEMMWrapperKernel::Params 
¶ms, + const CPUInfo &ci) override + { + auto prepare_b = support::cpp14::make_unique>(); + prepare_b->configure(b, transformed_b, false, ci, params); + return std::move(prepare_b); + } + std::unique_ptr instantiate_transformA(const ITensor *a, + ITensor *transformed_a, + const Window &block_walker, + const INEGEMMWrapperKernel::Params ¶ms) override + { + auto transform_a = support::cpp14::make_unique>(); + transform_a->configure(a, transformed_a, false, block_walker, params); + return std::move(transform_a); + } + std::unique_ptr instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, + const Window &block_walker, const BlockSizes &block_sizes, + const INEGEMMWrapperKernel::Params ¶ms, float alpha, float beta, bool pretranspose_b, + unsigned int num_threads) override + { + auto matrix_multiply = support::cpp14::make_unique>(); + matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, num_threads); + return std::move(matrix_multiply); + } -//Use different strategies for 8bit dot product: -template <> -struct Kernel -{ - DEFINE_STRATEGY_SUFFIX(gemm_s8_12x8, "_dot") + BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params ¶ms) override + { + return calculate_block_sizes(ci, params.M, params.N, params.K); + } }; -template <> -struct Kernel -{ - DEFINE_STRATEGY_SUFFIX(gemm_u8_12x8, "_dot") -}; -#else -template <> -struct Kernel -{ - DEFINE_STRATEGY(sgemm_8x6) -}; -#endif /* __aarch64__ */ -#endif /* __ARM_FEATURE_SVE */ - -#undef DEFINE_STRATEGY -#undef DEFINE_STRATEGY_SUFFIX -} // namespace +/** Create the backend GEMM strategy to use given the provided kernel info + * + * @param[in] kernel_name Kernel name of the backend strategy to instantiate + * + * @return The requested kernel strategy if exists else nullptr + */ +std::unique_ptr create_strategy(const std::string &kernel_name) +{ +#if 
defined(__arm__) + if(kernel_name.find("sgemm_8x6") != std::string::npos) + { + return support::cpp14::make_unique>(); + } +#endif // defined(__arm__) +#if defined(__aarch64__) + if(kernel_name.find("gemm_s8_4x4") != std::string::npos) + { + return support::cpp14::make_unique>(); + } + if(kernel_name.find("gemm_s8_12x8") != std::string::npos) + { + return support::cpp14::make_unique>(); + } + if(kernel_name.find("gemm_u8_4x4") != std::string::npos) + { + return support::cpp14::make_unique>(); + } + if(kernel_name.find("gemm_u8_12x8") != std::string::npos) + { + return support::cpp14::make_unique>(); + } +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + if(kernel_name.find("hgemm_24x8") != std::string::npos) + { + return support::cpp14::make_unique>(); + } +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + if(kernel_name.find("sgemm_12x8") != std::string::npos) + { + return support::cpp14::make_unique>(); + } +#if defined(__ARM_FEATURE_SVE) + if(kernel_name.find("interleaved_fp16_mla_3VLx8") != std::string::npos) + { + return support::cpp14::make_unique>(); + } + if(kernel_name.find("interleaved_fp32_mla_3VLx8") != std::string::npos) + { + return support::cpp14::make_unique>(); + } + if(kernel_name.find("interleaved_s8s32_dot_3VLx8") != std::string::npos) + { + return support::cpp14::make_unique>(); + } + if(kernel_name.find("interleaved_u8u32_dot_3VLx8") != std::string::npos) + { + return support::cpp14::make_unique>(); + } +#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(__aarch64__)_ + return nullptr; +} +} // namespace detail } // namespace arm_compute #endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDSTRATEGIES_H__ */ diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp deleted file mode 100644 index 3b80a1f940..0000000000 --- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 
2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h" - -#include "NEGEMMInterleavedStrategies.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/WindowIterator.h" - -#include "utils/TypePrinter.h" - -namespace arm_compute -{ -template -void NEGEMMInterleavedTransformAWrapperTemplate::configure(const ITensor *a, ITensor *transformed_a, bool transpose_a, const Window &block_walker, - const INEGEMMWrapperKernel::Params ¶ms) -{ - _a = a; - _transformed_a = transformed_a; - _transpose_a = transpose_a; - _Ksize = params.K; - _Msize = params.M; - _k_multi_window = block_walker.shift_dimensions(1); // block_walker contains (M,K,Multi) --> shift by 1 to get rid of the "M" dimension -} - -template -void NEGEMMInterleavedTransformAWrapperTemplate::transform(const TransformAWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset, - const Coordinates &end_offset) -{ - using strategy = typename Kernel::strategy; - - strategy strat(info.cpu_info); - TensorAccessor a(*_a); - TensorAccessor transformed_a(*_transformed_a); - - if(_a->info()->data_layout() == DataLayout::NHWC) - { - // In the case of NHWC we want to interpret the output shape as 3D. Thus, the batch stride for A is - // the relevant multiple of the row stride. 
- const size_t nhwc_batch_stride = _a->info()->strides_in_bytes().y() * _Msize; - a.set_stride(2, nhwc_batch_stride); - } - - unsigned int last_m = 0; - //TODO: Create a new iterate_1D( DimY); - int last_y = -1; - auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id) - { - if(id.y() != last_y) - { - last_y = id.y(); - unsigned int batch = id.y(); - unsigned int first_m = id.x(); - - if(first_m >= last_m) - return; - - strat.transforms.PrepareA(transformed_a(0, first_m, batch), - a(0, 0, batch, wl._multi), - a.stride(1), first_m, last_m, wl._k0, wl._kmax, _transpose_a); - } - }); - auto on_new_row_size = [&](unsigned int start, unsigned int end) - { - last_m = std::min(end, _Msize); - }; - window_iterator.iterate_2D(on_new_row_size); -} - -template -void NEGEMMInterleavedTransformAWrapperTemplate::create_workloads(std::vector &workloads) -{ - execute_window_loop(_k_multi_window, [&](const Coordinates & id) - { - const unsigned int k0 = id.x(); - const unsigned int multi = id.y(); - const unsigned int kmax = std::min(k0 + _k_multi_window.x().step(), _Ksize); - - workloads.push_back(TransformAWorkload(k0, kmax, multi)); - }); -} - -template class NEGEMMInterleavedTransformAWrapperTemplate; -#ifdef __aarch64__ -template class NEGEMMInterleavedTransformAWrapperTemplate; -template class NEGEMMInterleavedTransformAWrapperTemplate; -template class NEGEMMInterleavedTransformAWrapperTemplate; -template class NEGEMMInterleavedTransformAWrapperTemplate; -#endif /* __aarch64__ */ - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class NEGEMMInterleavedTransformAWrapperTemplate; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -} // namespace arm_compute diff --git a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp index e452dfbcf2..7b1f3e7ba0 100644 --- a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp +++ 
b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -34,11 +34,7 @@ #include "../arm_gemm/mergeresults.hpp" #include "../arm_gemm/transform.hpp" -#include "../arm_gemm/kernels/a32_sgemm_8x6.hpp" -#include "../arm_gemm/kernels/a64_sgemm_12x8.hpp" #include "../arm_gemm/kernels/a64_sgemm_native_16x4.hpp" -#include "../arm_gemm/kernels/a64_sgemv_pretransposed.hpp" -#include "../arm_gemm/kernels/a64_sgemv_trans.hpp" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index 25be4a5349..cd614ba582 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,6 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" #include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h" -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h" -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h" #include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h" @@ -38,14 +35,14 @@ namespace arm_compute { namespace { -std::unique_ptr create_function_all_types(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint, +std::unique_ptr create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info, + const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool 
pretranspose_hint, std::shared_ptr memory_manager) { //Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure() - switch(method) + switch(gemm_kernel_info.method) { - case arm_gemm::GemmMethod::GEMM_INTERLEAVED_FP16: case arm_gemm::GemmMethod::GEMM_INTERLEAVED: { if(!pretranspose_hint) @@ -56,92 +53,24 @@ std::unique_ptr create_function_all_types(arm_gemm::GemmMethod method function->configure(a, b, d, alpha, beta, pretranspose_hint); return std::move(function); } - default: - return nullptr; - } -} - -template -std::unique_ptr create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint, - std::shared_ptr memory_manager) -{ - ARM_COMPUTE_UNUSED(method); - ARM_COMPUTE_UNUSED(a); - ARM_COMPUTE_UNUSED(b); - ARM_COMPUTE_UNUSED(d); - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_UNUSED(pretranspose_hint); - ARM_COMPUTE_UNUSED(memory_manager); - return nullptr; -} - -#ifdef __aarch64__ -template <> -std::unique_ptr create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint, - std::shared_ptr memory_manager) -{ - switch(method) - { - case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT: - { - if(!pretranspose_hint) - { - return nullptr; - } - auto function = support::cpp14::make_unique(memory_manager); - function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */); - return std::move(function); - } - default: - return nullptr; - } - return nullptr; -} - -template <> -std::unique_ptr create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint, - std::shared_ptr memory_manager) -{ - switch(method) - { - case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT: +#if defined(__aarch64__) + case arm_gemm::GemmMethod::GEMM_NATIVE: { - 
if(!pretranspose_hint) + if(gemm_kernel_info.name.find("sgemm_native_16x4") != std::string::npos) { - return nullptr; + auto kernel = support::cpp14::make_unique>(); + kernel->configure(a, b, d, alpha, beta); + auto function = support::cpp14::make_unique(); + function->configure(std::move(kernel)); + return std::move(function); } - auto function = support::cpp14::make_unique(memory_manager); - function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */); - return std::move(function); - } - default: return nullptr; - } - return nullptr; -} - -template <> -std::unique_ptr create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint, - std::shared_ptr memory_manager) -{ - ARM_COMPUTE_UNUSED(pretranspose_hint); - ARM_COMPUTE_UNUSED(memory_manager); - switch(method) - { - case arm_gemm::GemmMethod::GEMM_NATIVE: - { - auto kernel = support::cpp14::make_unique>(); - kernel->configure(a, b, d, alpha, beta); - auto function = support::cpp14::make_unique(); - function->configure(std::move(kernel)); - return std::move(function); } +#endif // defined(__aarch64__) default: return nullptr; } } -#endif /* __aarch64__ */ /** Fallback in case ACL doesn't have a function */ template @@ -189,7 +118,7 @@ private: template void Fallback::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs &args, MemoryGroup &memory_group) { - _gemm_kernel_asm = arm_gemm::gemm(args, nullptr); + _gemm_kernel_asm = arm_gemm::gemm(args); if(_gemm_kernel_asm == nullptr) { //configuration not supported: Leave function unconfigured: @@ -334,12 +263,8 @@ void create_function_or_arm_gemm(std::unique_ptr &acl_function, std:: arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint); //Try to create an ACL function: - acl_function = create_function_all_types(arm_gemm::get_gemm_method(args), a, b, d, alpha, beta, 
pretranspose_hint, memory_manager); - // If the type agnostic factory failed to create an ACL function, try the specialised one: - if(acl_function == nullptr) - { - acl_function = create_function(arm_gemm::get_gemm_method(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager); - } + acl_function = create_function_all_types(arm_gemm::get_gemm_method(args), a, b, d, alpha, beta, pretranspose_hint, std::move(memory_manager)); + //If we still don't have an ACL function: if(acl_function == nullptr) { diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp index fe998a0e42..695fc859de 100644 --- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp +++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -26,12 +26,11 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/kernels/assembly/Helpers.h" -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h" -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h" -#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h" #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h" + #include #include #include @@ -179,6 +178,7 @@ NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr -std::unique_ptr instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params ¶ms) -{ - auto prepare_b = support::cpp14::make_unique>(); - prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params); - return std::move(prepare_b); -} - -// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate: 
-template -std::unique_ptr instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params ¶ms) -{ - auto transform_a = support::cpp14::make_unique>(); - transform_a->configure(a, transformed_a, false, block_walker, params); - return std::move(transform_a); -} - -// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate: -template -std::unique_ptr instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker, - const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params ¶ms, bool pretranspose_b, float alpha, float beta) -{ - auto matrix_multiply = support::cpp14::make_unique>(); - matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads()); - return std::move(matrix_multiply); -} -} // namespace - -void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot) +void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b) { _params = INEGEMMWrapperKernel::extract_parameters(a, b, c); _a = a; @@ -373,18 +342,26 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe _c = c; _pretranspose_b = pretranspose_b; - DataType input_type = a->info()->data_type(); + const DataType input_type = a->info()->data_type(); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, pretranspose_b); + ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED); // Forcing 128-byte alignment (required by 32-bit kernels) const 
unsigned int alignment = 128; _transformed_b.allocator()->init(TensorInfo{}, alignment); _tmp_c.allocator()->init(TensorInfo{}, alignment); - _tag = "NEGEMMInterleaved_"; - _tag += get_strategy_name(input_type, use_dot); + _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name; + + // Get strategy + std::unique_ptr strategy = detail::create_strategy(gemm_kernel_info.name); + ARM_COMPUTE_ERROR_ON(strategy == nullptr); if(!_pretranspose_b) { - _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot); + _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params); _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height)); _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches)); // If the execution is single threaded or has only one window then the buffer manager only needs 1 buffer else we will use NUM_BUFFERS buffers and ping pong between them: @@ -409,43 +386,8 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe { _tag += "_preB"; } - switch(input_type) - { - case DataType::F32: - _prepare_b = instantiate_prepareB(_b, &_transformed_b, _params); - break; -#ifdef __aarch64__ - case DataType::U8: - case DataType::QASYMM8: - if(use_dot) - { - _prepare_b = instantiate_prepareB(_b, &_transformed_b, _params); - } - else - { - _prepare_b = instantiate_prepareB(_b, &_transformed_b, _params); - } - break; - case DataType::S8: - if(use_dot) - { - _prepare_b = instantiate_prepareB(_b, &_transformed_b, _params); - } - else - { - _prepare_b = instantiate_prepareB(_b, &_transformed_b, _params); - } - break; -#endif /* __aarch64__ */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - 
ARM_COMPUTE_ERROR("DataType not supported"); - break; - } + + _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci); ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr); if(_pretranspose_b) @@ -463,51 +405,11 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe _memory_group.manage(&_transformed_a); _memory_group.manage(&_tmp_c); - switch(input_type) - { - case DataType::F32: - _transform_a = instantiate_transformA(_a, &_transformed_a, _block_walker, _params); - _matrix_multiply = instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta); - break; -#ifdef __aarch64__ - case DataType::U8: - case DataType::QASYMM8: - if(use_dot) - { - _transform_a = instantiate_transformA(_a, &_transformed_a, _block_walker, _params); - _matrix_multiply = instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta); - } - else - { - _transform_a = instantiate_transformA(_a, &_transformed_a, _block_walker, _params); - _matrix_multiply = instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta); - } - break; - case DataType::S8: - if(use_dot) - { - _transform_a = instantiate_transformA(_a, &_transformed_a, _block_walker, _params); - _matrix_multiply = instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta); - } - else - { - _transform_a = instantiate_transformA(_a, &_transformed_a, _block_walker, _params); - _matrix_multiply = instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta); - } - break; -#endif /* __aarch64__ */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _transform_a = 
instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params); - _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta); - break; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - break; - } + _transform_a = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params); + _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, pretranspose_b, num_threads); ARM_COMPUTE_ERROR_ON(_transform_a == nullptr); ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr); + _transformed_a.allocator()->allocate(); _tmp_c.allocator()->allocate(); if(!_pretranspose_b) -- cgit v1.2.1